summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason Gunthorpe <jgg@nvidia.com>2025-10-23 15:22:33 -0300
committerJoerg Roedel <joerg.roedel@amd.com>2025-11-05 09:50:17 +0100
commit5448c1558f60d4051c90938f2878c6fb20e2982a (patch)
tree43ec2c7f56950c93dcbe70a7fe34f8da5e08b0ff
parentefa03dab7ce4ed786b131f412440e2fd45fba11f (diff)
iommupt: Add the Intel VT-d second stage page table format
The VT-d second stage format is almost the same as the x86 PAE format, except the bit encodings in the PTE are different and a few new PTE features, like force coherency are present. Among all the formats it is unique in not having a designated present bit. Comparing the performance of several operations to the existing version: iommu_map() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 53,66 , 50,64 , 21.21 2^21, 59,70 , 56,67 , 16.16 2^30, 54,66 , 52,63 , 17.17 256*2^12, 384,524 , 337,516 , 34.34 256*2^21, 387,632 , 336,626 , 46.46 256*2^30, 376,629 , 323,623 , 48.48 iommu_unmap() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 67,86 , 63,84 , 25.25 2^21, 64,84 , 59,80 , 26.26 2^30, 59,78 , 56,74 , 24.24 256*2^12, 216,335 , 198,317 , 37.37 256*2^21, 245,350 , 232,344 , 32.32 256*2^30, 248,345 , 226,339 , 33.33 Cc: Tina Zhang <tina.zhang@intel.com> Cc: Kevin Tian <kevin.tian@intel.com> Cc: Lu Baolu <baolu.lu@linux.intel.com> Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
-rw-r--r--drivers/iommu/generic_pt/.kunitconfig1
-rw-r--r--drivers/iommu/generic_pt/Kconfig11
-rw-r--r--drivers/iommu/generic_pt/fmt/Makefile2
-rw-r--r--drivers/iommu/generic_pt/fmt/defs_vtdss.h21
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_vtdss.c10
-rw-r--r--drivers/iommu/generic_pt/fmt/vtdss.h292
-rw-r--r--include/linux/generic_pt/common.h18
-rw-r--r--include/linux/generic_pt/iommu.h11
8 files changed, 366 insertions, 0 deletions
diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig
index 2016c5e5ac0f..52ac9e661ffd 100644
--- a/drivers/iommu/generic_pt/.kunitconfig
+++ b/drivers/iommu/generic_pt/.kunitconfig
@@ -3,6 +3,7 @@ CONFIG_GENERIC_PT=y
CONFIG_DEBUG_GENERIC_PT=y
CONFIG_IOMMU_PT=y
CONFIG_IOMMU_PT_AMDV1=y
+CONFIG_IOMMU_PT_VTDSS=y
CONFIG_IOMMU_PT_X86_64=y
CONFIG_IOMMU_PT_KUNIT_TEST=y
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index 6dcb771b3c58..79f65268f312 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -42,6 +42,16 @@ config IOMMU_PT_AMDV1
Selected automatically by an IOMMU driver that uses this format.
+config IOMMU_PT_VTDSS
+ tristate "IOMMU page table for Intel VT-d Second Stage"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5
+ level Second Stage page table. It is similar to the X86_64 format with
+ 4K/2M/1G page sizes.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
config IOMMU_PT_X86_64
tristate "IOMMU page table for x86 64-bit, 4/5 levels"
depends on !GENERIC_ATOMIC64 # for cmpxchg64
@@ -57,6 +67,7 @@ config IOMMU_PT_KUNIT_TEST
depends on KUNIT
depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1
depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64
+ depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS
default KUNIT_ALL_TESTS
help
Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
index 5a3379107999..976b49ec97dc 100644
--- a/drivers/iommu/generic_pt/fmt/Makefile
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -3,6 +3,8 @@
iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss
+
iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64
IOMMU_PT_KUNIT_TEST :=
diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
new file mode 100644
index 000000000000..4a239bcaae2a
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H
+#define __GENERIC_PT_FMT_DEFS_VTDSS_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct vtdss_pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs vtdss_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
new file mode 100644
index 000000000000..f551711e2a33
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT vtdss
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \
+ BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h
new file mode 100644
index 000000000000..d9774848eb6f
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/vtdss.h
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Intel VT-d Second Stange 5/4 level page table
+ *
+ * This is described in
+ * Section "3.7 Second-Stage Translation"
+ * Section "9.8 Second-Stage Paging Entries"
+ *
+ * Of the "Intel Virtualization Technology for Directed I/O Architecture
+ * Specification".
+ *
+ * The named levels in the spec map to the pts->level as:
+ * Table/SS-PTE - 0
+ * Directory/SS-PDE - 1
+ * Directory Ptr/SS-PDPTE - 2
+ * PML4/SS-PML4E - 3
+ * PML5/SS-PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_VTDSS_H
+#define __GENERIC_PT_FMT_VTDSS_H
+
+#include "defs_vtdss.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 57,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /* SSPTPTR is 4k aligned and limited by HAW */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+ VTDSS_FMT_R = BIT(0),
+ VTDSS_FMT_W = BIT(1),
+ VTDSS_FMT_A = BIT(8),
+ VTDSS_FMT_D = BIT(9),
+ VTDSS_FMT_SNP = BIT(11),
+ VTDSS_FMT_OA = GENMASK_ULL(51, 12),
+};
+
+/* PDPTE/PDE */
+enum {
+ VTDSS_FMT_PS = BIT(7),
+};
+
+#define common_to_vtdss_pt(common_ptr) \
+ container_of_const(common_ptr, struct pt_vtdss, common)
+#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common)
+
+static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa vtdss_pt_table_pa
+
+static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa vtdss_pt_entry_oa
+
+static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts)
+{
+ return pts->level <= 2;
+}
+#define pt_can_have_leaf vtdss_pt_can_have_leaf
+
+static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts)
+{
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 vtdss_pt_num_items_lg2
+
+static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(tablep[pts->index]);
+ if (!entry)
+ return PT_ENTRY_EMPTY;
+ if (pts->level == 0 ||
+ (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS)))
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw vtdss_pt_load_entry_raw
+
+static inline void
+vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+ if (pts->level != 0)
+ entry |= VTDSS_FMT_PS;
+
+ WRITE_ONCE(tablep[pts->index], entry);
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry vtdss_pt_install_leaf_entry
+
+static inline bool vtdss_pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ entry = VTDSS_FMT_R | VTDSS_FMT_W |
+ FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table vtdss_pt_install_table
+
+static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits = pts->entry &
+ (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP);
+}
+#define pt_attr_from_entry vtdss_pt_attr_from_entry
+
+static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ return READ_ONCE(*tablep) & VTDSS_FMT_D;
+}
+#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty
+
+static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D);
+}
+#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean
+
+static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | VTDSS_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty
+
+static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common)
+{
+ return 10;
+}
+#define pt_max_sw_bit vtdss_pt_max_sw_bit
+
+static inline u64 vtdss_pt_sw_bit(unsigned int bitnr)
+{
+ /* Bits marked Ignored in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(10);
+ case 1 ... 9:
+ return BIT_ULL((bitnr - 1) + 52);
+ case 10:
+ return BIT_ULL(63);
+ /* Some bits in 9-3 are available in some entries */
+ default:
+ if (__builtin_constant_p(bitnr))
+ BUILD_BUG();
+ else
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit vtdss_pt_sw_bit
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_vtdss
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_table, iommu)
+ ->vtdss_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_table, vtdss_pt.common)
+ ->iommu;
+}
+
+static inline int vtdss_pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ /*
+ * VTDSS does not have a present bit, so we tell if any entry is present
+ * by checking for R or W.
+ */
+ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+ return -EINVAL;
+
+ if (iommu_prot & IOMMU_READ)
+ pte |= VTDSS_FMT_R;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= VTDSS_FMT_W;
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE))
+ pte |= VTDSS_FMT_SNP;
+
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) &&
+ !(iommu_prot & IOMMU_WRITE)) {
+ pr_err_ratelimited(
+ "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+ return -EINVAL;
+ }
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot vtdss_pt_iommu_set_prot
+
+static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
+ const struct pt_iommu_vtdss_cfg *cfg)
+{
+ struct pt_vtdss *table = &iommu_table->vtdss_pt;
+ unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+
+ if (vasz_lg2 > PT_MAX_VA_ADDRESS_LG2)
+ return -EOPNOTSUPP;
+ else if (vasz_lg2 > 48)
+ pt_top_set_level(&table->common, 4);
+ else if (vasz_lg2 > 39)
+ pt_top_set_level(&table->common, 3);
+ else if (vasz_lg2 > 30)
+ pt_top_set_level(&table->common, 2);
+ else
+ return -EOPNOTSUPP;
+ return 0;
+}
+#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init
+
+static inline void
+vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_vtdss_hw_info *info)
+{
+ info->ssptptr = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK);
+ /*
+ * top_level = 2 = 3 level table aw=1
+ * top_level = 3 = 4 level table aw=2
+ * top_level = 4 = 5 level table aw=3
+ */
+ info->aw = top_range->top_level - 1;
+}
+#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
+ [0] = { .common.hw_max_vasz_lg2 = 39 },
+ [1] = { .common.hw_max_vasz_lg2 = 48 },
+ [2] = { .common.hw_max_vasz_lg2 = 57 },
+};
+#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
+#endif
+#endif
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
index 883069e32952..6a9a1acb5aad 100644
--- a/include/linux/generic_pt/common.h
+++ b/include/linux/generic_pt/common.h
@@ -157,6 +157,24 @@ enum {
PT_FEAT_AMDV1_FORCE_COHERENCE,
};
+struct pt_vtdss {
+ struct pt_common common;
+};
+
+enum {
+ /*
+ * The PTEs are set to prevent cache incoherent traffic, such as PCI no
+ * snoop. This is set either at creation time or before the first map
+ * operation.
+ */
+ PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START,
+ /*
+ * Prevent creating read-only PTEs. Used to work around HW errata
+ * ERRATA_772415_SPR17.
+ */
+ PT_FEAT_VTDSS_FORCE_WRITEABLE,
+};
+
struct pt_x86_64 {
struct pt_common common;
};
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 21132e342a79..cfe05a77f86b 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -262,6 +262,17 @@ IOMMU_FORMAT(amdv1, amdpt);
struct pt_iommu_amdv1_mock_hw_info;
IOMMU_PROTOTYPES(amdv1_mock);
+struct pt_iommu_vtdss_cfg {
+ struct pt_iommu_cfg common;
+};
+
+struct pt_iommu_vtdss_hw_info {
+ u64 ssptptr;
+ u8 aw;
+};
+
+IOMMU_FORMAT(vtdss, vtdss_pt);
+
struct pt_iommu_x86_64_cfg {
struct pt_iommu_cfg common;
};