summary refs log tree commit diff
path: root/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_crat.c')
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_crat.c 1697
1 files changed, 1392 insertions, 305 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 5d85ff341385..4a7180b46b71 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
- * Copyright 2015-2017 Advanced Micro Devices, Inc.
+ * Copyright 2015-2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -25,8 +26,9 @@
#include "kfd_crat.h"
#include "kfd_priv.h"
#include "kfd_topology.h"
-#include "kfd_iommu.h"
+#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
+#include "amdgpu_xgmi.h"
/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
* GPU processor ID are expressed with Bit[31]=1.
@@ -48,32 +50,23 @@ static inline unsigned int get_and_inc_gpu_processor_id(
return current_id;
}
-/* Static table to describe GPU Cache information */
-struct kfd_gpu_cache_info {
- uint32_t cache_size;
- uint32_t cache_level;
- uint32_t flags;
- /* Indicates how many Compute Units share this cache
- * Value = 1 indicates the cache is not shared
- */
- uint32_t num_cu_shared;
-};
static struct kfd_gpu_cache_info kaveri_cache_info[] = {
{
/* TCP L1 Cache per CU */
.cache_size = 16,
.cache_level = 1,
+ .cache_line_size = 64,
.flags = (CRAT_CACHE_FLAGS_ENABLED |
CRAT_CACHE_FLAGS_DATA_CACHE |
CRAT_CACHE_FLAGS_SIMD_CACHE),
.num_cu_shared = 1,
-
},
{
/* Scalar L1 Instruction Cache (in SQC module) per bank */
.cache_size = 16,
.cache_level = 1,
+ .cache_line_size = 64,
.flags = (CRAT_CACHE_FLAGS_ENABLED |
CRAT_CACHE_FLAGS_INST_CACHE |
CRAT_CACHE_FLAGS_SIMD_CACHE),
@@ -83,6 +76,7 @@ static struct kfd_gpu_cache_info kaveri_cache_info[] = {
/* Scalar L1 Data Cache (in SQC module) per bank */
.cache_size = 8,
.cache_level = 1,
+ .cache_line_size = 64,
.flags = (CRAT_CACHE_FLAGS_ENABLED |
CRAT_CACHE_FLAGS_DATA_CACHE |
CRAT_CACHE_FLAGS_SIMD_CACHE),
@@ -98,6 +92,7 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = {
/* TCP L1 Cache per CU */
.cache_size = 16,
.cache_level = 1,
+ .cache_line_size = 64,
.flags = (CRAT_CACHE_FLAGS_ENABLED |
CRAT_CACHE_FLAGS_DATA_CACHE |
CRAT_CACHE_FLAGS_SIMD_CACHE),
@@ -105,8 +100,9 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = {
},
{
/* Scalar L1 Instruction Cache (in SQC module) per bank */
- .cache_size = 8,
+ .cache_size = 32,
.cache_level = 1,
+ .cache_line_size = 64,
.flags = (CRAT_CACHE_FLAGS_ENABLED |
CRAT_CACHE_FLAGS_INST_CACHE |
CRAT_CACHE_FLAGS_SIMD_CACHE),
@@ -114,8 +110,9 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = {
},
{
/* Scalar L1 Data Cache (in SQC module) per bank. */
- .cache_size = 4,
+ .cache_size = 16,
.cache_level = 1,
+ .cache_line_size = 64,
.flags = (CRAT_CACHE_FLAGS_ENABLED |
CRAT_CACHE_FLAGS_DATA_CACHE |
CRAT_CACHE_FLAGS_SIMD_CACHE),
@@ -125,18 +122,901 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = {
/* TODO: Add L2 Cache information */
};
-/* NOTE: In future if more information is added to struct kfd_gpu_cache_info
- * the following ASICs may need a separate table.
- */
#define hawaii_cache_info kaveri_cache_info
#define tonga_cache_info carrizo_cache_info
#define fiji_cache_info carrizo_cache_info
#define polaris10_cache_info carrizo_cache_info
#define polaris11_cache_info carrizo_cache_info
#define polaris12_cache_info carrizo_cache_info
-/* TODO - check & update Vega10 cache details */
-#define vega10_cache_info carrizo_cache_info
-#define raven_cache_info carrizo_cache_info
+#define vegam_cache_info carrizo_cache_info
+
+/* NOTE: L1 cache information has been updated and L2/L3
+ * cache information has been added for Vega10 and
+ * newer ASICs. The unit for cache_size is KiB.
+ * In future, the cache details must be checked and
+ * updated for every new ASIC.
+ */
+/*
+ * Cache table for Vega10: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, and the GPU-wide L2.
+ * cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info vega10_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 3,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 3,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 4096,
+ .cache_level = 2,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 16,
+ },
+};
+/*
+ * Cache table for Raven: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, and the GPU-wide L2.
+ * cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info raven_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 3,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 3,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 1024,
+ .cache_level = 2,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 11,
+ },
+};
+/*
+ * Cache table for Renoir: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, and the GPU-wide L2.
+ * cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info renoir_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 3,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 3,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 1024,
+ .cache_level = 2,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 8,
+ },
+};
+/*
+ * Cache table for Vega12: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, and the GPU-wide L2.
+ * cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info vega12_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 3,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 3,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 2048,
+ .cache_level = 2,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 5,
+ },
+};
+/*
+ * Cache table for Vega20: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, and the GPU-wide L2.
+ * cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info vega20_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 3,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 3,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 8192,
+ .cache_level = 2,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 16,
+ },
+};
+/*
+ * Cache table for Aldebaran: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, and the GPU-wide L2.
+ * cache_size values are in KiB. The L2 entry is the only one in
+ * this table whose cache_line_size is 128 rather than 64.
+ */
+static struct kfd_gpu_cache_info aldebaran_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 8192,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 14,
+ },
+};
+/*
+ * Cache table for Navi10: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, the per-SA GL1 (also
+ * reported as cache_level 1), and the GPU-wide L2.
+ * cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info navi10_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 10,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 4096,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 10,
+ },
+};
+/*
+ * Cache table for Van Gogh: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, the per-SA GL1 (also
+ * reported as cache_level 1), and the GPU-wide L2.
+ * cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info vangogh_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 8,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 1024,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 8,
+ },
+};
+/*
+ * Cache table for Navi14: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, the per-SA GL1 (also
+ * reported as cache_level 1), and the GPU-wide L2.
+ * cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info navi14_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 12,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 2048,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 12,
+ },
+};
+/*
+ * Cache table for Sienna Cichlid: the per-CU TCP L1, the per-SQC
+ * scalar instruction and scalar data L1 caches, the per-SA GL1
+ * (also reported as cache_level 1), the GPU-wide L2, and a
+ * GPU-wide L3. cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 10,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 4096,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 10,
+ },
+ {
+ /* L3 Data Cache per GPU */
+ .cache_size = 128*1024, /* 128 MiB, expressed in KiB */
+ .cache_level = 3,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 10,
+ },
+};
+/*
+ * Cache table for Navy Flounder: the per-CU TCP L1, the per-SQC
+ * scalar instruction and scalar data L1 caches, the per-SA GL1
+ * (also reported as cache_level 1), the GPU-wide L2, and a
+ * GPU-wide L3. cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info navy_flounder_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 10,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 3072,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 10,
+ },
+ {
+ /* L3 Data Cache per GPU */
+ .cache_size = 96*1024, /* 96 MiB, expressed in KiB */
+ .cache_level = 3,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 10,
+ },
+};
+/*
+ * Cache table for Dimgrey Cavefish: the per-CU TCP L1, the per-SQC
+ * scalar instruction and scalar data L1 caches, the per-SA GL1
+ * (also reported as cache_level 1), the GPU-wide L2, and a
+ * GPU-wide L3. cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 8,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 2048,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 8,
+ },
+ {
+ /* L3 Data Cache per GPU */
+ .cache_size = 32*1024, /* 32 MiB, expressed in KiB */
+ .cache_level = 3,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 8,
+ },
+};
+/*
+ * Cache table for Beige Goby: the per-CU TCP L1, the per-SQC
+ * scalar instruction and scalar data L1 caches, the per-SA GL1
+ * (also reported as cache_level 1), the GPU-wide L2, and a
+ * GPU-wide L3. cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info beige_goby_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 8,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 1024,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 8,
+ },
+ {
+ /* L3 Data Cache per GPU */
+ .cache_size = 16*1024, /* 16 MiB, expressed in KiB */
+ .cache_level = 3,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 8,
+ },
+};
+/*
+ * Cache table for Yellow Carp: the per-CU TCP L1, the per-SQC
+ * scalar instruction and scalar data L1 caches, the per-SA GL1
+ * (also reported as cache_level 1), and the GPU-wide L2.
+ * cache_size values are in KiB.
+ */
+static struct kfd_gpu_cache_info yellow_carp_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 6,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 2048,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 6,
+ },
+};
+/*
+ * Cache table for gfx1037: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, the per-SA GL1 (also
+ * reported as cache_level 1), and the GPU-wide L2.
+ * cache_size values are in KiB. Every field value in this table
+ * matches gc_10_3_6_cache_info.
+ */
+static struct kfd_gpu_cache_info gfx1037_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 256,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+};
+/*
+ * Cache table for GC 10.3.6: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, the per-SA GL1 (also
+ * reported as cache_level 1), and the GPU-wide L2.
+ * cache_size values are in KiB. Every field value in this table
+ * matches gfx1037_cache_info.
+ */
+static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 256,
+ .cache_level = 2,
+ .cache_line_size = 128,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+};
+/*
+ * Generic cache table: the per-CU TCP L1, the per-SQC scalar
+ * instruction and scalar data L1 caches, the per-SA GL1 (also
+ * reported as cache_level 1), and the GPU-wide L2. Every entry
+ * uses a cache_line_size of 64. cache_size values are in KiB.
+ * NOTE(review): presumably the fallback for ASICs that have no
+ * dedicated table - confirm against the code that selects it.
+ */
+static struct kfd_gpu_cache_info dummy_cache_info[] = {
+ {
+ /* TCP L1 Cache per CU */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 1,
+ },
+ {
+ /* Scalar L1 Instruction Cache per SQC */
+ .cache_size = 32,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* Scalar L1 Data Cache per SQC */
+ .cache_size = 16,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 2,
+ },
+ {
+ /* GL1 Data Cache per SA */
+ .cache_size = 128,
+ .cache_level = 1,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 6,
+ },
+ {
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ .cache_size = 2048,
+ .cache_level = 2,
+ .cache_line_size = 64,
+ .flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE),
+ .num_cu_shared = 6,
+ },
+};
static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
struct crat_subtype_computeunit *cu)
@@ -318,8 +1198,12 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
props->cachelines_per_tag = cache->lines_per_tag;
props->cache_assoc = cache->associativity;
props->cache_latency = cache->cache_latency;
+
memcpy(props->sibling_map, cache->sibling_map,
- sizeof(props->sibling_map));
+ CRAT_SIBLINGMAP_SIZE);
+
+ /* set the sibling_map_size as 32 for CRAT from ACPI */
+ props->sibling_map_size = CRAT_SIBLINGMAP_SIZE;
if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
props->cache_type |= HSA_CACHE_TYPE_DATA;
@@ -330,7 +1214,6 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
props->cache_type |= HSA_CACHE_TYPE_HSACU;
- dev->cache_count++;
dev->node_props.caches_count++;
list_add_tail(&props->list, &dev->cache_props);
@@ -372,7 +1255,7 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
props->weight = 20;
else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
- props->weight = 15;
+ props->weight = iolink->weight_xgmi;
else
props->weight = node_distance(id_from, id_to);
@@ -383,7 +1266,6 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
props->rec_transfer_size =
iolink->recommended_transfer_size;
- dev->io_link_count++;
dev->node_props.io_links_count++;
list_add_tail(&props->list, &dev->io_link_props);
break;
@@ -399,15 +1281,17 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
* table, add corresponded reversed direction link now.
*/
if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
- to_dev = kfd_topology_device_by_proximity_domain(id_to);
+ to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
if (!to_dev)
return -ENODEV;
/* same everything but the other direction */
props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
+ if (!props2)
+ return -ENOMEM;
+
props2->node_from = id_to;
props2->node_to = id_from;
props2->kobj = NULL;
- to_dev->io_link_count++;
to_dev->node_props.io_links_count++;
list_add_tail(&props2->list, &to_dev->io_link_props);
}
@@ -498,7 +1382,7 @@ int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
num_nodes = crat_table->num_domains;
image_len = crat_table->length;
- pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
+ pr_debug("Parsing CRAT table with %d nodes\n", num_nodes);
for (node_id = 0; node_id < num_nodes; node_id++) {
top_dev = kfd_create_topology_device(device_list);
@@ -537,247 +1421,319 @@ err:
return ret;
}
-/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
-static int fill_in_pcache(struct crat_subtype_cache *pcache,
- struct kfd_gpu_cache_info *pcache_info,
- struct kfd_cu_info *cu_info,
- int mem_available,
- int cu_bitmask,
- int cache_type, unsigned int cu_processor_id,
- int cu_block)
-{
- unsigned int cu_sibling_map_mask;
- int first_active_cu;
-
- /* First check if enough memory is available */
- if (sizeof(struct crat_subtype_cache) > mem_available)
- return -ENOMEM;
- cu_sibling_map_mask = cu_bitmask;
- cu_sibling_map_mask >>= cu_block;
- cu_sibling_map_mask &=
- ((1 << pcache_info[cache_type].num_cu_shared) - 1);
- first_active_cu = ffs(cu_sibling_map_mask);
+static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
+ bool cache_line_size_missing,
+ struct kfd_gpu_cache_info *pcache_info)
+{
+ struct amdgpu_device *adev = kdev->adev;
+ int i = 0;
+
+ /* TCP L1 Cache per CU */
+ if (adev->gfx.config.gc_tcp_l1_size) {
+ pcache_info[i].cache_size = adev->gfx.config.gc_tcp_l1_size;
+ pcache_info[i].cache_level = 1;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2;
+ pcache_info[i].cache_line_size = adev->gfx.config.gc_tcp_cache_line_size;
+ if (cache_line_size_missing && !pcache_info[i].cache_line_size)
+ pcache_info[i].cache_line_size = 128;
+ i++;
+ }
+ /* Scalar L1 Instruction Cache per SQC */
+ if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {
+ pcache_info[i].cache_size =
+ adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;
+ pcache_info[i].cache_level = 1;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
+ pcache_info[i].cache_line_size = adev->gfx.config.gc_instruction_cache_line_size;
+ if (cache_line_size_missing && !pcache_info[i].cache_line_size)
+ pcache_info[i].cache_line_size = 128;
+ i++;
+ }
+ /* Scalar L1 Data Cache per SQC */
+ if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {
+ pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;
+ pcache_info[i].cache_level = 1;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
+ pcache_info[i].cache_line_size = adev->gfx.config.gc_scalar_data_cache_line_size;
+ if (cache_line_size_missing && !pcache_info[i].cache_line_size)
+ pcache_info[i].cache_line_size = 64;
+ i++;
+ }
+ /* GL1 Data Cache per SA */
+ if (adev->gfx.config.gc_gl1c_per_sa &&
+ adev->gfx.config.gc_gl1c_size_per_instance) {
+ pcache_info[i].cache_size = adev->gfx.config.gc_gl1c_per_sa *
+ adev->gfx.config.gc_gl1c_size_per_instance;
+ pcache_info[i].cache_level = 1;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
+ if (cache_line_size_missing)
+ pcache_info[i].cache_line_size = 128;
+ i++;
+ }
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ if (adev->gfx.config.gc_gl2c_per_gpu) {
+ pcache_info[i].cache_size = adev->gfx.config.gc_gl2c_per_gpu;
+ pcache_info[i].cache_level = 2;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
+ pcache_info[i].cache_line_size = adev->gfx.config.gc_tcc_cache_line_size;
+ if (cache_line_size_missing && !pcache_info[i].cache_line_size)
+ pcache_info[i].cache_line_size = 128;
+ i++;
+ }
+ /* L3 Data Cache per GPU */
+ if (adev->gmc.mall_size) {
+ pcache_info[i].cache_size = adev->gmc.mall_size / 1024;
+ pcache_info[i].cache_level = 3;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
+ pcache_info[i].cache_line_size = 64;
+ i++;
+ }
+ return i;
+}
- /* CU could be inactive. In case of shared cache find the first active
- * CU. and incase of non-shared cache check if the CU is inactive. If
- * inactive active skip it
- */
- if (first_active_cu) {
- memset(pcache, 0, sizeof(struct crat_subtype_cache));
- pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
- pcache->length = sizeof(struct crat_subtype_cache);
- pcache->flags = pcache_info[cache_type].flags;
- pcache->processor_id_low = cu_processor_id
- + (first_active_cu - 1);
- pcache->cache_level = pcache_info[cache_type].cache_level;
- pcache->cache_size = pcache_info[cache_type].cache_size;
-
- /* Sibling map is w.r.t processor_id_low, so shift out
- * inactive CU
- */
- cu_sibling_map_mask =
- cu_sibling_map_mask >> (first_active_cu - 1);
-
- pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
- pcache->sibling_map[1] =
- (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
- pcache->sibling_map[2] =
- (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
- pcache->sibling_map[3] =
- (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
- return 0;
+static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev,
+ struct kfd_gpu_cache_info *pcache_info)
+{
+ struct amdgpu_device *adev = kdev->adev;
+ int i = 0;
+
+ /* TCP L1 Cache per CU */
+ if (adev->gfx.config.gc_tcp_size_per_cu) {
+ pcache_info[i].cache_size = adev->gfx.config.gc_tcp_size_per_cu;
+ pcache_info[i].cache_level = 1;
+ /* Cacheline size not available in IP discovery for gc943,gc944 */
+ pcache_info[i].cache_line_size = 128;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = 1;
+ i++;
+ }
+ /* Scalar L1 Instruction Cache per SQC */
+ if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {
+ pcache_info[i].cache_size =
+ adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;
+ pcache_info[i].cache_level = 1;
+ pcache_info[i].cache_line_size = 64;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_INST_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc;
+ i++;
+ }
+ /* Scalar L1 Data Cache per SQC */
+ if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {
+ pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;
+ pcache_info[i].cache_level = 1;
+ pcache_info[i].cache_line_size = 64;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc;
+ i++;
+ }
+ /* L2 Data Cache per GPU (Total Tex Cache) */
+ if (adev->gfx.config.gc_tcc_size) {
+ pcache_info[i].cache_size = adev->gfx.config.gc_tcc_size;
+ pcache_info[i].cache_level = 2;
+ pcache_info[i].cache_line_size = 128;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
+ i++;
+ }
+ /* L3 Data Cache per GPU */
+ if (adev->gmc.mall_size) {
+ pcache_info[i].cache_size = adev->gmc.mall_size / 1024;
+ pcache_info[i].cache_level = 3;
+ pcache_info[i].cache_line_size = 64;
+ pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
+ CRAT_CACHE_FLAGS_DATA_CACHE |
+ CRAT_CACHE_FLAGS_SIMD_CACHE);
+ pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
+ i++;
}
- return 1;
+ return i;
}
-/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info
- * tables
- *
- * @kdev - [IN] GPU device
- * @gpu_processor_id - [IN] GPU processor ID to which these caches
- * associate
- * @available_size - [IN] Amount of memory available in pcache
- * @cu_info - [IN] Compute Unit info obtained from KGD
- * @pcache - [OUT] memory into which cache data is to be filled in.
- * @size_filled - [OUT] amount of data used up in pcache.
- * @num_of_entries - [OUT] number of caches added
- */
-static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
- int gpu_processor_id,
- int available_size,
- struct kfd_cu_info *cu_info,
- struct crat_subtype_cache *pcache,
- int *size_filled,
- int *num_of_entries)
+int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pcache_info)
{
- struct kfd_gpu_cache_info *pcache_info;
int num_of_cache_types = 0;
- int i, j, k;
- int ct = 0;
- int mem_available = available_size;
- unsigned int cu_processor_id;
- int ret;
+ bool cache_line_size_missing = false;
- switch (kdev->device_info->asic_family) {
+ switch (kdev->adev->asic_type) {
case CHIP_KAVERI:
- pcache_info = kaveri_cache_info;
+ *pcache_info = kaveri_cache_info;
num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
break;
case CHIP_HAWAII:
- pcache_info = hawaii_cache_info;
+ *pcache_info = hawaii_cache_info;
num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
break;
case CHIP_CARRIZO:
- pcache_info = carrizo_cache_info;
+ *pcache_info = carrizo_cache_info;
num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
break;
case CHIP_TONGA:
- pcache_info = tonga_cache_info;
+ *pcache_info = tonga_cache_info;
num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
break;
case CHIP_FIJI:
- pcache_info = fiji_cache_info;
+ *pcache_info = fiji_cache_info;
num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
break;
case CHIP_POLARIS10:
- pcache_info = polaris10_cache_info;
+ *pcache_info = polaris10_cache_info;
num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
break;
case CHIP_POLARIS11:
- pcache_info = polaris11_cache_info;
+ *pcache_info = polaris11_cache_info;
num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
break;
case CHIP_POLARIS12:
- pcache_info = polaris12_cache_info;
+ *pcache_info = polaris12_cache_info;
num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
break;
- case CHIP_VEGA10:
- case CHIP_VEGA12:
- case CHIP_VEGA20:
- pcache_info = vega10_cache_info;
- num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
- break;
- case CHIP_RAVEN:
- pcache_info = raven_cache_info;
- num_of_cache_types = ARRAY_SIZE(raven_cache_info);
+ case CHIP_VEGAM:
+ *pcache_info = vegam_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
break;
default:
- return -EINVAL;
- }
-
- *size_filled = 0;
- *num_of_entries = 0;
-
- /* For each type of cache listed in the kfd_gpu_cache_info table,
- * go through all available Compute Units.
- * The [i,j,k] loop will
- * if kfd_gpu_cache_info.num_cu_shared = 1
- * will parse through all available CU
- * If (kfd_gpu_cache_info.num_cu_shared != 1)
- * then it will consider only one CU from
- * the shared unit
- */
-
- for (ct = 0; ct < num_of_cache_types; ct++) {
- cu_processor_id = gpu_processor_id;
- for (i = 0; i < cu_info->num_shader_engines; i++) {
- for (j = 0; j < cu_info->num_shader_arrays_per_engine;
- j++) {
- for (k = 0; k < cu_info->num_cu_per_sh;
- k += pcache_info[ct].num_cu_shared) {
-
- ret = fill_in_pcache(pcache,
- pcache_info,
- cu_info,
- mem_available,
- cu_info->cu_bitmap[i][j],
- ct,
- cu_processor_id,
- k);
-
- if (ret < 0)
- break;
-
- if (!ret) {
- pcache++;
- (*num_of_entries)++;
- mem_available -=
- sizeof(*pcache);
- (*size_filled) +=
- sizeof(*pcache);
- }
-
- /* Move to next CU block */
- cu_processor_id +=
- pcache_info[ct].num_cu_shared;
- }
- }
+ switch (KFD_GC_VERSION(kdev)) {
+ case IP_VERSION(9, 0, 1):
+ *pcache_info = vega10_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
+ break;
+ case IP_VERSION(9, 2, 1):
+ *pcache_info = vega12_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vega12_cache_info);
+ break;
+ case IP_VERSION(9, 4, 0):
+ case IP_VERSION(9, 4, 1):
+ *pcache_info = vega20_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vega20_cache_info);
+ break;
+ case IP_VERSION(9, 4, 2):
+ *pcache_info = aldebaran_cache_info;
+ num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);
+ break;
+ case IP_VERSION(9, 4, 3):
+ case IP_VERSION(9, 4, 4):
+ case IP_VERSION(9, 5, 0):
+ num_of_cache_types =
+ kfd_fill_gpu_cache_info_from_gfx_config_v2(kdev->kfd,
+ *pcache_info);
+ break;
+ case IP_VERSION(9, 1, 0):
+ case IP_VERSION(9, 2, 2):
+ *pcache_info = raven_cache_info;
+ num_of_cache_types = ARRAY_SIZE(raven_cache_info);
+ break;
+ case IP_VERSION(9, 3, 0):
+ *pcache_info = renoir_cache_info;
+ num_of_cache_types = ARRAY_SIZE(renoir_cache_info);
+ break;
+ case IP_VERSION(10, 1, 10):
+ case IP_VERSION(10, 1, 2):
+ case IP_VERSION(10, 1, 3):
+ case IP_VERSION(10, 1, 4):
+ *pcache_info = navi10_cache_info;
+ num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
+ break;
+ case IP_VERSION(10, 1, 1):
+ *pcache_info = navi14_cache_info;
+ num_of_cache_types = ARRAY_SIZE(navi14_cache_info);
+ break;
+ case IP_VERSION(10, 3, 0):
+ *pcache_info = sienna_cichlid_cache_info;
+ num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);
+ break;
+ case IP_VERSION(10, 3, 2):
+ *pcache_info = navy_flounder_cache_info;
+ num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);
+ break;
+ case IP_VERSION(10, 3, 4):
+ *pcache_info = dimgrey_cavefish_cache_info;
+ num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);
+ break;
+ case IP_VERSION(10, 3, 1):
+ *pcache_info = vangogh_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);
+ break;
+ case IP_VERSION(10, 3, 5):
+ *pcache_info = beige_goby_cache_info;
+ num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);
+ break;
+ case IP_VERSION(10, 3, 3):
+ *pcache_info = yellow_carp_cache_info;
+ num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);
+ break;
+ case IP_VERSION(10, 3, 6):
+ *pcache_info = gc_10_3_6_cache_info;
+ num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info);
+ break;
+ case IP_VERSION(10, 3, 7):
+ *pcache_info = gfx1037_cache_info;
+ num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info);
+ break;
+ case IP_VERSION(11, 0, 0):
+ case IP_VERSION(11, 0, 1):
+ case IP_VERSION(11, 0, 2):
+ case IP_VERSION(11, 0, 3):
+ case IP_VERSION(11, 0, 4):
+ case IP_VERSION(11, 5, 0):
+ case IP_VERSION(11, 5, 1):
+ case IP_VERSION(11, 5, 2):
+ case IP_VERSION(11, 5, 3):
+ /* Cacheline size not available in IP discovery for gc11.
+ * kfd_fill_gpu_cache_info_from_gfx_config to hard code it
+ */
+ cache_line_size_missing = true;
+ fallthrough;
+ case IP_VERSION(12, 0, 0):
+ case IP_VERSION(12, 0, 1):
+ num_of_cache_types =
+ kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd,
+ cache_line_size_missing,
+ *pcache_info);
+ break;
+ default:
+ *pcache_info = dummy_cache_info;
+ num_of_cache_types = ARRAY_SIZE(dummy_cache_info);
+ pr_warn("dummy cache info is used temporarily and real cache info need update later.\n");
+ break;
}
}
-
- pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
-
- return 0;
-}
-
-/*
- * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
- * copies CRAT from ACPI (if available).
- * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
- *
- * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then
- * crat_image will be NULL
- * @size: [OUT] size of crat_image
- *
- * Return 0 if successful else return error code
- */
-int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
-{
- struct acpi_table_header *crat_table;
- acpi_status status;
- void *pcrat_image;
-
- if (!crat_image)
- return -EINVAL;
-
- *crat_image = NULL;
-
- /* Fetch the CRAT table from ACPI */
- status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
- if (status == AE_NOT_FOUND) {
- pr_warn("CRAT table not found\n");
- return -ENODATA;
- } else if (ACPI_FAILURE(status)) {
- const char *err = acpi_format_exception(status);
-
- pr_err("CRAT table error: %s\n", err);
- return -EINVAL;
- }
-
- if (ignore_crat) {
- pr_info("CRAT table disabled by module option\n");
- return -ENODATA;
- }
-
- pcrat_image = kmemdup(crat_table, crat_table->length, GFP_KERNEL);
- if (!pcrat_image)
- return -ENOMEM;
-
- *crat_image = pcrat_image;
- *size = crat_table->length;
-
- return 0;
+ return num_of_cache_types;
}
/* Memory required to create Virtual CRAT.
* Since there is no easy way to predict the amount of memory required, the
- * following amount are allocated for CPU and GPU Virtual CRAT. This is
+ * following amount is allocated for GPU Virtual CRAT. This is
* expected to cover all known conditions. But to be safe additional check
* is put in the code to ensure we don't overwrite.
*/
-#define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE)
-#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE)
+#define VCRAT_SIZE_FOR_GPU (4 * PAGE_SIZE)
/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node
*
@@ -863,7 +1819,7 @@ static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,
return 0;
}
-#if CONFIG_X86_64
+#ifdef CONFIG_X86_64
static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,
uint32_t *num_entries,
struct crat_subtype_iolink *sub_type_hdr)
@@ -927,7 +1883,7 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
#endif
int ret = 0;
- if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU)
+ if (!pcrat_image)
return -EINVAL;
/* Fill in CRAT Header.
@@ -951,6 +1907,7 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
CRAT_OEMID_LENGTH);
memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
CRAT_OEMTABLEID_LENGTH);
+ acpi_put_table(acpi_table);
}
crat_table->total_entries = 0;
crat_table->num_domains = 0;
@@ -992,11 +1949,14 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
(struct crat_subtype_iolink *)sub_type_hdr);
if (ret < 0)
return ret;
- crat_table->length += (sub_type_hdr->length * entries);
- crat_table->total_entries += entries;
- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
- sub_type_hdr->length * entries);
+ if (entries) {
+ crat_table->length += (sub_type_hdr->length * entries);
+ crat_table->total_entries += entries;
+
+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+ sub_type_hdr->length * entries);
+ }
#else
pr_info("IO link not available for non x86 platforms\n");
#endif
@@ -1019,7 +1979,7 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
}
static int kfd_fill_gpu_memory_affinity(int *avail_size,
- struct kfd_dev *kdev, uint8_t type, uint64_t size,
+ struct kfd_node *kdev, uint8_t type, uint64_t size,
struct crat_subtype_memory *sub_type_hdr,
uint32_t proximity_domain,
const struct kfd_local_mem_info *local_mem_info)
@@ -1047,6 +2007,95 @@ static int kfd_fill_gpu_memory_affinity(int *avail_size,
return 0;
}
+#ifdef CONFIG_ACPI_NUMA
+static void kfd_find_numa_node_in_srat(struct kfd_node *kdev)
+{
+ struct acpi_table_header *table_header = NULL;
+ struct acpi_subtable_header *sub_header = NULL;
+ unsigned long table_end, subtable_len;
+ u32 pci_id = pci_domain_nr(kdev->adev->pdev->bus) << 16 |
+ pci_dev_id(kdev->adev->pdev);
+ u32 bdf;
+ acpi_status status;
+ struct acpi_srat_cpu_affinity *cpu;
+ struct acpi_srat_generic_affinity *gpu;
+ int pxm = 0, max_pxm = 0;
+ int numa_node = NUMA_NO_NODE;
+ bool found = false;
+
+ /* Fetch the SRAT table from ACPI */
+ status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);
+ if (status == AE_NOT_FOUND) {
+ pr_warn("SRAT table not found\n");
+ return;
+ } else if (ACPI_FAILURE(status)) {
+ const char *err = acpi_format_exception(status);
+ pr_err("SRAT table error: %s\n", err);
+ return;
+ }
+
+ table_end = (unsigned long)table_header + table_header->length;
+
+ /* Parse all entries looking for a match. */
+ sub_header = (struct acpi_subtable_header *)
+ ((unsigned long)table_header +
+ sizeof(struct acpi_table_srat));
+ subtable_len = sub_header->length;
+
+ while (((unsigned long)sub_header) + subtable_len < table_end) {
+ /*
+ * If length is 0, break from this loop to avoid
+ * infinite loop.
+ */
+ if (subtable_len == 0) {
+ pr_err("SRAT invalid zero length\n");
+ break;
+ }
+
+ switch (sub_header->type) {
+ case ACPI_SRAT_TYPE_CPU_AFFINITY:
+ cpu = (struct acpi_srat_cpu_affinity *)sub_header;
+ pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |
+ cpu->proximity_domain_lo;
+ if (pxm > max_pxm)
+ max_pxm = pxm;
+ break;
+ case ACPI_SRAT_TYPE_GENERIC_AFFINITY:
+ gpu = (struct acpi_srat_generic_affinity *)sub_header;
+ bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |
+ *((u16 *)(&gpu->device_handle[2]));
+ if (bdf == pci_id) {
+ found = true;
+ numa_node = pxm_to_node(gpu->proximity_domain);
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (found)
+ break;
+
+ sub_header = (struct acpi_subtable_header *)
+ ((unsigned long)sub_header + subtable_len);
+ subtable_len = sub_header->length;
+ }
+
+ acpi_put_table(table_header);
+
+ /* Workaround bad cpu-gpu binding case */
+ if (found && (numa_node < 0 ||
+ numa_node > pxm_to_node(max_pxm)))
+ numa_node = 0;
+
+ if (numa_node != NUMA_NO_NODE)
+ set_dev_node(&kdev->adev->pdev->dev, numa_node);
+}
+#endif
+
+#define KFD_CRAT_INTRA_SOCKET_WEIGHT 13
+#define KFD_CRAT_XGMI_WEIGHT 15
+
/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU
* to its NUMA node
* @avail_size: Available size in the memory
@@ -1057,7 +2106,7 @@ static int kfd_fill_gpu_memory_affinity(int *avail_size,
* Return 0 if successful else return -ve value
*/
static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
- struct kfd_dev *kdev,
+ struct kfd_node *kdev,
struct crat_subtype_iolink *sub_type_hdr,
uint32_t proximity_domain)
{
@@ -1077,13 +2126,51 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
/* Fill in IOLINK subtype.
* TODO: Fill-in other fields of iolink subtype
*/
- sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
+ if (kdev->adev->gmc.xgmi.connected_to_cpu ||
+ (KFD_GC_VERSION(kdev) == IP_VERSION(9, 4, 3) &&
+ kdev->adev->smuio.funcs->get_pkg_type(kdev->adev) ==
+ AMDGPU_PKG_TYPE_APU)) {
+ bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3);
+ int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT :
+ KFD_CRAT_INTRA_SOCKET_WEIGHT;
+ /*
+ * with host gpu xgmi link, host can access gpu memory whether
+ * or not pcie bar type is large, so always create bidirectional
+ * io link.
+ */
+ sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
+ sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
+ sub_type_hdr->weight_xgmi = weight;
+ if (ext_cpu) {
+ amdgpu_xgmi_get_bandwidth(kdev->adev, NULL,
+ AMDGPU_XGMI_BW_MODE_PER_LINK,
+ AMDGPU_XGMI_BW_UNIT_MBYTES,
+ &sub_type_hdr->minimum_bandwidth_mbs,
+ &sub_type_hdr->maximum_bandwidth_mbs);
+ } else {
+ sub_type_hdr->minimum_bandwidth_mbs = mem_bw;
+ sub_type_hdr->maximum_bandwidth_mbs = mem_bw;
+ }
+ } else {
+ sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
+ sub_type_hdr->minimum_bandwidth_mbs =
+ amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, true);
+ sub_type_hdr->maximum_bandwidth_mbs =
+ amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, false);
+ }
+
sub_type_hdr->proximity_domain_from = proximity_domain;
+
+#ifdef CONFIG_ACPI_NUMA
+ if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE &&
+ num_possible_nodes() > 1)
+ kfd_find_numa_node_in_srat(kdev);
+#endif
#ifdef CONFIG_NUMA
- if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
+ if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE)
sub_type_hdr->proximity_domain_to = 0;
else
- sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node;
+ sub_type_hdr->proximity_domain_to = kdev->adev->pdev->dev.numa_node;
#else
sub_type_hdr->proximity_domain_to = 0;
#endif
@@ -1091,11 +2178,14 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
}
static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
- struct kfd_dev *kdev,
+ struct kfd_node *kdev,
+ struct kfd_node *peer_kdev,
struct crat_subtype_iolink *sub_type_hdr,
uint32_t proximity_domain_from,
uint32_t proximity_domain_to)
{
+ bool use_ta_info = kdev->kfd->num_nodes == 1;
+
*avail_size -= sizeof(struct crat_subtype_iolink);
if (*avail_size < 0)
return -ENOMEM;
@@ -1110,6 +2200,26 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
sub_type_hdr->proximity_domain_from = proximity_domain_from;
sub_type_hdr->proximity_domain_to = proximity_domain_to;
+
+ if (use_ta_info) {
+ sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT *
+ amdgpu_xgmi_get_hops_count(kdev->adev, peer_kdev->adev);
+ amdgpu_xgmi_get_bandwidth(kdev->adev, peer_kdev->adev,
+ AMDGPU_XGMI_BW_MODE_PER_PEER,
+ AMDGPU_XGMI_BW_UNIT_MBYTES,
+ &sub_type_hdr->minimum_bandwidth_mbs,
+ &sub_type_hdr->maximum_bandwidth_mbs);
+ } else {
+ bool is_single_hop = kdev->kfd == peer_kdev->kfd;
+ int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT :
+ (2 * KFD_CRAT_INTRA_SOCKET_WEIGHT) + KFD_CRAT_XGMI_WEIGHT;
+ int mem_bw = 819200;
+
+ sub_type_hdr->weight_xgmi = weight;
+ sub_type_hdr->maximum_bandwidth_mbs = is_single_hop ? mem_bw : 0;
+ sub_type_hdr->minimum_bandwidth_mbs = is_single_hop ? mem_bw : 0;
+ }
+
return 0;
}
@@ -1120,19 +2230,18 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
* [OUT] actual size of data filled in crat_image
*/
static int kfd_create_vcrat_image_gpu(void *pcrat_image,
- size_t *size, struct kfd_dev *kdev,
+ size_t *size, struct kfd_node *kdev,
uint32_t proximity_domain)
{
struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+ struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;
+ struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;
struct crat_subtype_generic *sub_type_hdr;
struct kfd_local_mem_info local_mem_info;
struct kfd_topology_device *peer_dev;
struct crat_subtype_computeunit *cu;
- struct kfd_cu_info cu_info;
int avail_size = *size;
uint32_t total_num_of_cu;
- int num_of_cache_entries = 0;
- int cache_mem_filled = 0;
uint32_t nid = 0;
int ret = 0;
@@ -1143,9 +2252,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
* Modify length and total_entries as subunits are added.
*/
avail_size -= sizeof(struct crat_header);
- if (avail_size < 0)
- return -ENOMEM;
-
memset(crat_table, 0, sizeof(struct crat_header));
memcpy(&crat_table->signature, CRAT_SIGNATURE,
@@ -1159,9 +2265,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
* First fill in the sub type header and then sub type data
*/
avail_size -= sizeof(struct crat_subtype_computeunit);
- if (avail_size < 0)
- return -ENOMEM;
-
sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));
@@ -1174,29 +2277,23 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
cu->proximity_domain = proximity_domain;
- amdgpu_amdkfd_get_cu_info(kdev->kgd, &cu_info);
- cu->num_simd_per_cu = cu_info.simd_per_cu;
- cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number;
- cu->max_waves_simd = cu_info.max_waves_per_simd;
+ cu->num_simd_per_cu = cu_info->simd_per_cu;
+ cu->num_simd_cores = cu_info->simd_per_cu *
+ (cu_info->number / kdev->kfd->num_nodes);
+ cu->max_waves_simd = cu_info->max_waves_per_simd;
- cu->wave_front_size = cu_info.wave_front_size;
- cu->array_count = cu_info.num_shader_arrays_per_engine *
- cu_info.num_shader_engines;
- total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
+ cu->wave_front_size = cu_info->wave_front_size;
+ cu->array_count = gfx_info->max_sh_per_se *
+ gfx_info->max_shader_engines;
+ total_num_of_cu = (cu->array_count * gfx_info->max_cu_per_sh);
cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
- cu->num_cu_per_array = cu_info.num_cu_per_sh;
- cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
- cu->num_banks = cu_info.num_shader_engines;
- cu->lds_size_in_kb = cu_info.lds_size;
+ cu->num_cu_per_array = gfx_info->max_cu_per_sh;
+ cu->max_slots_scatch_cu = cu_info->max_scratch_slots_per_cu;
+ cu->num_banks = gfx_info->max_shader_engines;
+ cu->lds_size_in_kb = cu_info->lds_size;
cu->hsa_capability = 0;
- /* Check if this node supports IOMMU. During parsing this flag will
- * translate to HSA_CAP_ATS_PRESENT
- */
- if (!kfd_iommu_check_device(kdev))
- cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT;
-
crat_table->length += sub_type_hdr->length;
crat_table->total_entries++;
@@ -1205,11 +2302,11 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
* report the total FB size (public+private) as a single
* private heap.
*/
- amdgpu_amdkfd_get_local_mem_info(kdev->kgd, &local_mem_info);
+ local_mem_info = kdev->local_mem_info;
sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
sub_type_hdr->length);
- if (debug_largebar)
+ if (kdev->adev->debug_largebar)
local_mem_info.local_mem_size_private = 0;
if (local_mem_info.local_mem_size_private == 0)
@@ -1233,31 +2330,12 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
crat_table->length += sizeof(struct crat_subtype_memory);
crat_table->total_entries++;
- /* TODO: Fill in cache information. This information is NOT readily
- * available in KGD
- */
- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
- sub_type_hdr->length);
- ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
- avail_size,
- &cu_info,
- (struct crat_subtype_cache *)sub_type_hdr,
- &cache_mem_filled,
- &num_of_cache_entries);
-
- if (ret < 0)
- return ret;
-
- crat_table->length += cache_mem_filled;
- crat_table->total_entries += num_of_cache_entries;
- avail_size -= cache_mem_filled;
-
/* Fill in Subtype: IO_LINKS
* Only direct links are added here which is Link from GPU to
- * to its NUMA node. Indirect links are added by userspace.
+ * its NUMA node. Indirect links are added by userspace.
*/
sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
- cache_mem_filled);
+ sub_type_hdr->length);
ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
@@ -1276,18 +2354,20 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
* (from other GPU to this GPU) will be added
* in kfd_parse_subtype_iolink.
*/
- if (kdev->hive_id) {
+ if (kdev->kfd->hive_id) {
for (nid = 0; nid < proximity_domain; ++nid) {
- peer_dev = kfd_topology_device_by_proximity_domain(nid);
+ peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);
if (!peer_dev->gpu)
continue;
- if (peer_dev->gpu->hive_id != kdev->hive_id)
+ if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
+ continue;
+ if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev, peer_dev->gpu->adev))
continue;
sub_type_hdr = (typeof(sub_type_hdr))(
(char *)sub_type_hdr +
sizeof(struct crat_subtype_iolink));
ret = kfd_fill_gpu_xgmi_link_to_gpu(
- &avail_size, kdev,
+ &avail_size, kdev, peer_dev->gpu,
(struct crat_subtype_iolink *)sub_type_hdr,
proximity_domain, nid);
if (ret < 0)
@@ -1315,39 +2395,46 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
* (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
* -- this option is not currently implemented.
* The assumption is that all AMD APUs will have CRAT
- * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU
+ * @kdev: Valid kfd_node required if flags contain COMPUTE_UNIT_GPU
*
* Return 0 if successful else return -ve value
*/
int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
- int flags, struct kfd_dev *kdev,
+ int flags, struct kfd_node *kdev,
uint32_t proximity_domain)
{
void *pcrat_image = NULL;
- int ret = 0;
+ int ret = 0, num_nodes;
+ size_t dyn_size;
if (!crat_image)
return -EINVAL;
*crat_image = NULL;
- /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and
- * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover
- * all the current conditions. A check is put not to overwrite beyond
- * allocated size
+ /* Allocate the CPU Virtual CRAT size based on the number of online
+ * nodes. Allocate VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image.
+ * This should cover all the current conditions. A check is put not
+ * to overwrite beyond allocated size for GPUs
*/
switch (flags) {
case COMPUTE_UNIT_CPU:
- pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL);
+ num_nodes = num_online_nodes();
+ dyn_size = sizeof(struct crat_header) +
+ num_nodes * (sizeof(struct crat_subtype_computeunit) +
+ sizeof(struct crat_subtype_memory) +
+ (num_nodes - 1) * sizeof(struct crat_subtype_iolink));
+ pcrat_image = kvmalloc(dyn_size, GFP_KERNEL);
if (!pcrat_image)
return -ENOMEM;
- *size = VCRAT_SIZE_FOR_CPU;
+ *size = dyn_size;
+ pr_debug("CRAT size is %ld", dyn_size);
ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
break;
case COMPUTE_UNIT_GPU:
if (!kdev)
return -EINVAL;
- pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
+ pcrat_image = kvmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
if (!pcrat_image)
return -ENOMEM;
*size = VCRAT_SIZE_FOR_GPU;
@@ -1366,7 +2453,7 @@ int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
if (!ret)
*crat_image = pcrat_image;
else
- kfree(pcrat_image);
+ kvfree(pcrat_image);
return ret;
}
@@ -1379,5 +2466,5 @@ int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
*/
void kfd_destroy_crat_image(void *crat_image)
{
- kfree(crat_image);
+ kvfree(crat_image);
}