From 88a309465b3f05a100c3b81966982c0f9f5d23a6 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 20 Jan 2022 05:20:06 -0800 Subject: lib: zstd: clean up double word in comment. Remove the second 'a' and 'into'. Signed-off-by: Tom Rix Signed-off-by: Nick Terrell --- include/linux/zstd_lib.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h index b8c7dbf98390..6b91758b61af 100644 --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -1330,7 +1330,7 @@ ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, /*! ZSTD_mergeBlockDelimiters() : * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals - * by merging them into into the literals of the next sequence. + * by merging them into the literals of the next sequence. * * As such, the final generated result has no explicit representation of block boundaries, * and the final last literals segment is not represented in the sequences. @@ -1377,7 +1377,7 @@ ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size /*! ZSTD_writeSkippableFrame() : * Generates a zstd skippable frame containing data given by src, and writes it to dst buffer. * - * Skippable frames begin with a a 4-byte magic number. There are 16 possible choices of magic number, + * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number, * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. -- cgit From 7b6f08771bf6045c32a2a1e18201c1aeeb78d72a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 7 Sep 2022 10:34:30 +0300 Subject: wifi: ieee80211: Support validating ML station profile length Add a function to validate EHT Multi-Link per station profile length. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 79690938d9a2..bdf668f9dace 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4685,6 +4685,46 @@ struct ieee80211_mle_per_sta_profile { u8 variable[]; } __packed; +/** + * ieee80211_mle_sta_prof_size_ok - validate multi-link element sta profile size + * @data: pointer to the sub element data + * @len: length of the containing sub element + */ +static inline bool ieee80211_mle_sta_prof_size_ok(const u8 *data, size_t len) +{ + const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; + u16 control; + u8 fixed = sizeof(*prof); + u8 info_len = 1; + + if (len < fixed) + return false; + + control = le16_to_cpu(prof->control); + + if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) + info_len += 6; + if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) + info_len += 8; + if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT) + info_len += 1; + + if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && + control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) { + if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) + info_len += 2; + else + info_len += 1; + } + + return prof->sta_info_len >= info_len && + fixed + prof->sta_info_len <= len; +} + #define for_each_mle_subelement(_elem, _data, _len) \ if (ieee80211_mle_size_ok(_data, _len)) \ for_each_element(_elem, \ -- cgit From 1403b109c9a5244dc6ab79154f04eecc209ef3d2 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 7 Sep 2022 17:23:09 +0300 Subject: wifi: cfg80211/mac80211: Fix ML element common size calculation The common size is part of the length in the data so don't add it again. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index bdf668f9dace..442b13333da8 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4573,18 +4573,17 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { case IEEE80211_ML_CONTROL_TYPE_BASIC: - common += sizeof(struct ieee80211_mle_basic_common_info); - break; case IEEE80211_ML_CONTROL_TYPE_PREQ: - common += sizeof(struct ieee80211_mle_preq_common_info); + case IEEE80211_ML_CONTROL_TYPE_TDLS: + /* + * The length is the first octet pointed by mle->variable so no + * need to add anything + */ break; case IEEE80211_ML_CONTROL_TYPE_RECONF: if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) common += ETH_ALEN; return common; - case IEEE80211_ML_CONTROL_TYPE_TDLS: - common += sizeof(struct ieee80211_mle_tdls_common_info); - break; case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR) common += ETH_ALEN; -- cgit From fb99c7d4d6d0fb4fe5a953e0c5f6c37a5b796b98 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Sun, 11 Sep 2022 16:55:12 +0300 Subject: wifi: cfg80211/mac80211: Fix ML element common size validation The Multi-Link element can be fragmented, thus its size can exceed 254. Thus, modify ieee80211_mle_size_ok() to use 'size_t len' instead of 'u8 len'. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 442b13333da8..b935a85b2f44 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4601,7 +4601,7 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) * @data: pointer to the element data * @len: length of the containing element */ -static inline bool ieee80211_mle_size_ok(const u8 *data, u8 len) +static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) { const struct ieee80211_multi_link_elem *mle = (const void *)data; u8 fixed = sizeof(*mle); -- cgit From 45ebac4f059b92906e7e86dd1a780739f883857c Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Tue, 6 Sep 2022 11:48:56 +0300 Subject: wifi: mac80211: Parse station profile from association response When processing an association response frame for a Multi-Link connection, extract the per station profile for each additional link, and use it for parsing the link elements. As the Multi-Link element might be fragmented, add support for reassembling a fragmented element. To simplify memory management logic, extend 'struct ieee802_11_elems' to hold a scratch buffer, which is used for the defragmentation. Once an element is reconstructed in the scratch area, point the corresponding element pointer to it. Currently only defragmentation of Multi-Link element and the contained per-STA profile subelement is supported. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b935a85b2f44..f51e939f28f2 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4666,6 +4666,7 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) enum ieee80211_mle_subelems { IEEE80211_MLE_SUBELEM_PER_STA_PROFILE = 0, + IEEE80211_MLE_SUBELEM_FRAGMENT = 254, }; #define IEEE80211_MLE_STA_CONTROL_LINK_ID 0x000f -- cgit From 1177aaa7fe9373c762cd5bf5f5de8517bac989d5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 17 Sep 2022 03:14:53 +0200 Subject: wifi: fix multi-link element subelement iteration The subelements obviously start after the common data, including the common multi-link element structure definition itself. This bug was possibly just hidden by the higher bits of the control being set to 0, so the iteration just found one bogus element and most of the code could continue anyway. Fixes: 0f48b8b88aa9 ("wifi: ieee80211: add definitions for multi-link element") Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f51e939f28f2..6252f02f38b7 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4593,7 +4593,7 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) return 0; } - return common + mle->variable[0]; + return sizeof(*mle) + common + mle->variable[0]; } /** -- cgit From a6a6163b9a934df5965792d60af1f53aa51fdd39 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 7 Oct 2022 10:53:03 +0200 Subject: mac802154: Introduce filtering levels The 802154 specification details several filtering levels in which the PHY and the MAC could be. The amount of filtering will vary if they are in promiscuous mode or in scanning mode. Otherwise they are expected to do some very basic checks, such as enforcing the frame validity. Either the PHY is able to do so, and the MAC has nothing to do, or the PHY has a lower filtering level than expected and the MAC should take over. For now we just define these levels in an enumeration. In a second time, we will add a per-PHY parameter showing the expected filtering level as well as a per device current filtering level, and will initialize all these fields. In a third time, we will use them to apply more filtering by software when the PHY is limited. Indeed, if the drivers know they cannot reach the requested level of filtering, they will overwrite the "current filtering" parameter so that it reflects what they do. Then, in the core, the expected filtering level will be used to decide whether some additional software processing is needed or not. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221007085310.503366-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/linux/ieee802154.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index f1f9412b6ac6..0303eb84d596 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -276,6 +276,30 @@ enum { IEEE802154_SYSTEM_ERROR = 0xff, }; +/** + * enum ieee802154_filtering_level - Filtering levels applicable to a PHY + * + * @IEEE802154_FILTERING_NONE: No filtering at all, what is received is + * forwarded to the softMAC + * @IEEE802154_FILTERING_1_FCS: First filtering level, frames with an invalid + * FCS should be dropped + * @IEEE802154_FILTERING_2_PROMISCUOUS: Second filtering level, promiscuous + * mode as described in the spec, identical in terms of filtering to the + * level one on PHY side, but at the MAC level the frame should be + * forwarded to the upper layer directly + * @IEEE802154_FILTERING_3_SCAN: Third filtering level, scan related, where + * only beacons must be processed, all remaining traffic gets dropped + * @IEEE802154_FILTERING_4_FRAME_FIELDS: Fourth filtering level actually + * enforcing the validity of the content of the frame with various checks + */ +enum ieee802154_filtering_level { + IEEE802154_FILTERING_NONE, + IEEE802154_FILTERING_1_FCS, + IEEE802154_FILTERING_2_PROMISCUOUS, + IEEE802154_FILTERING_3_SCAN, + IEEE802154_FILTERING_4_FRAME_FIELDS, +}; + /* frame control handling */ #define IEEE802154_FCTL_FTYPE 0x0003 #define IEEE802154_FCTL_ACKREQ 0x0020 -- cgit From 16988c742968d5ceb2e832a57bd277f51f0a59d7 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Sat, 8 Oct 2022 19:56:16 +0800 Subject: of/address: introduce of_address_count() helper Introduce of_address_count() helper to count the IO resources instead of open-coding it. Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20221008115617.3583890-2-yangyingliang@huawei.com Signed-off-by: Rob Herring --- include/linux/of_address.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_address.h b/include/linux/of_address.h index 45598dbec269..265f26eeaf6b 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -154,4 +154,15 @@ static inline const __be32 *of_get_pci_address(struct device_node *dev, int bar_ return __of_get_address(dev, -1, bar_no, size, flags); } +static inline int of_address_count(struct device_node *np) +{ + struct resource res; + int count = 0; + + while (of_address_to_resource(np, count, &res) == 0) + count++; + + return count; +} + #endif /* __OF_ADDRESS_H */ -- cgit From 91924d9bb1df2a234a0e055c550abdbd49412071 Mon Sep 17 00:00:00 2001 From: Christian Göttsche Date: Wed, 12 Oct 2022 19:46:22 +0200 Subject: of: declare string literals const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit of_overlay_action_name() returns a string literal from a function local array. Modifying string literals is undefined behavior which usage of const pointer can avoid. of_overlay_action_name() is currently only used once in overlay_notify() to print the returned value. While on it declare the data array const as well. Reported by Clang: In file included from arch/x86/kernel/asm-offsets.c:22: In file included from arch/x86/kernel/../kvm/vmx/vmx.h:5: In file included from ./include/linux/kvm_host.h:19: In file included from ./include/linux/msi.h:23: In file included from ./arch/x86/include/asm/msi.h:5: In file included from ./arch/x86/include/asm/irqdomain.h:5: In file included from ./include/linux/irqdomain.h:35: ./include/linux/of.h:1555:3: error: initializing 'char *' with an expression of type 'const char[5]' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers] "init", ^~~~~~ ./include/linux/of.h:1556:3: error: initializing 'char *' with an expression of type 'const char[10]' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers] "pre-apply", ^~~~~~~~~~~ ./include/linux/of.h:1557:3: error: initializing 'char *' with an expression of type 'const char[11]' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers] "post-apply", ^~~~~~~~~~~~ ./include/linux/of.h:1558:3: error: initializing 'char *' with an expression of type 'const char[11]' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers] "pre-remove", ^~~~~~~~~~~~ ./include/linux/of.h:1559:3: error: initializing 'char *' with an expression of type 'const char[12]' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers] "post-remove", ^~~~~~~~~~~~~ Signed-off-by: Christian Göttsche Reviewed-by: Frank Rowand Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20221012174622.45006-1-cgzones@googlemail.com Signed-off-by: Rob Herring --- include/linux/of.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 6b79ef9a6541..8b9f94386dc3 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1549,9 +1549,9 @@ enum of_overlay_notify_action { OF_OVERLAY_POST_REMOVE, }; -static inline char *of_overlay_action_name(enum of_overlay_notify_action action) +static inline const char *of_overlay_action_name(enum of_overlay_notify_action action) { - static char *of_overlay_action_name[] = { + static const char *const of_overlay_action_name[] = { "init", "pre-apply", "post-apply", -- cgit From d49a0626216b95cd4bf696f6acf55f39a16ab0bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:10:47 +0200 Subject: arch: Introduce CONFIG_FUNCTION_ALIGNMENT Generic function-alignment infrastructure. Architectures can select FUNCTION_ALIGNMENT_xxB symbols; the FUNCTION_ALIGNMENT symbol is then set to the largest such selected size, 0 otherwise. From this the -falign-functions compiler argument and __ALIGN macro are set. This incorporates the DEBUG_FORCE_FUNCTION_ALIGN_64B knob and future alignment requirements for x86_64 (later in this series) into a single place. NOTE: also removes the 0x90 filler byte from the generic __ALIGN primitive, that value makes no sense outside of x86. NOTE: .balign 0 reverts to a no-op. Requested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.719248727@infradead.org --- include/linux/linkage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 1feab6136b5b..5c8865bb59d9 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -69,8 +69,8 @@ #endif #ifndef __ALIGN -#define __ALIGN .align 4,0x90 -#define __ALIGN_STR ".align 4,0x90" +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT +#define __ALIGN_STR __stringify(__ALIGN) #endif #ifdef __ASSEMBLY__ -- cgit From bea75b33895f7f87f0c40023e36a2d087e87ffa1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:18 +0200 Subject: x86/Kconfig: Introduce function padding Now that all functions are 16 byte aligned, add 16 bytes of NOP padding in front of each function. This prepares things for software call stack tracking and kCFI/FineIBT. This significantly increases kernel .text size, around 5.1% on a x86_64-defconfig-ish build. However, per the random access argument used for alignment, these 16 extra bytes are code that wouldn't be used. Performance measurements back this up by showing no significant performance regressions. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.950884492@infradead.org --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e7d46d16032..5296aea9b5b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -984,7 +984,11 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func } #ifdef CONFIG_X86_64 +#ifdef CONFIG_CALL_THUNKS +#define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5+CONFIG_FUNCTION_PADDING_BYTES,CONFIG_FUNCTION_PADDING_BYTES))) +#else #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) +#endif #else #define BPF_DISPATCHER_ATTRIBUTES #endif -- cgit From 7825451fa4dc04660f1f53d236e4302161d0ebd1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:31 +0200 Subject: static_call: Add call depth tracking support When indirect calls are switched to direct calls then it has to be ensured that the call target is not the function, but the call thunk when call depth tracking is enabled. But static calls are available before call thunks have been set up. Ensure a second run through the static call patching code after call thunks have been created. When call thunks are not enabled this has no side effects. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.306100465@infradead.org --- include/linux/static_call.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/static_call.h b/include/linux/static_call.h index df53bed9d71f..141e6b176a1b 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -162,6 +162,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool extern int __init static_call_init(void); +extern void static_call_force_reinit(void); + struct static_call_mod { struct static_call_mod *next; struct module *mod; /* for vmlinux, mod == NULL */ -- cgit From 3b87d9f436b6893503f43f487d3bc6091d9db178 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 17 Oct 2022 14:08:09 -0400 Subject: fs: edit a comment made in bad taste I know nobody likes a buzzkill, but I figure it's best to keep the bad jokes appropriate for small children. Signed-off-by: Paul Moore --- include/linux/fs_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 13fa6f3df8e4..6fbf49cc10e4 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -99,7 +99,7 @@ struct fs_context { const struct cred *cred; /* The mounter's credentials */ struct p_log log; /* Logging buffer */ const char *source; /* The source name (eg. dev path) */ - void *security; /* Linux S&M options */ + void *security; /* LSM options */ void *s_fs_info; /* Proposed s_fs_info */ unsigned int sb_flags; /* Proposed superblock flags (SB_*) */ unsigned int sb_flags_mask; /* Superblock flags that were changed */ -- cgit From 8bd4da0f0626ae9a82099d3d99cd6efd4355879b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Oct 2022 13:01:10 -0700 Subject: pstore/ram: Move internal definitions out of kernel-wide include Most of the details of the ram backend are entirely internal to the backend itself. Leave only what is needed to instantiate a ram backend in the kernel-wide header. Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Signed-off-by: Kees Cook Reviewed-and-tested-by: Guilherme G. Piccoli Link: https://lore.kernel.org/r/20221011200112.731334-4-keescook@chromium.org --- include/linux/pstore_ram.h | 99 ---------------------------------------------- 1 file changed, 99 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 9f16afec7290..9d65ff94e216 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -8,28 +8,7 @@ #ifndef __LINUX_PSTORE_RAM_H__ #define __LINUX_PSTORE_RAM_H__ -#include -#include -#include -#include -#include #include -#include - -/* - * Choose whether access to the RAM zone requires locking or not. If a zone - * can be written to from different CPUs like with ftrace for example, then - * PRZ_FLAG_NO_LOCK is used. For all other cases, locking is required. - */ -#define PRZ_FLAG_NO_LOCK BIT(0) -/* - * If a PRZ should only have a single-boot lifetime, this marks it as - * getting wiped after its contents get copied out after boot. - */ -#define PRZ_FLAG_ZAP_OLD BIT(1) - -struct persistent_ram_buffer; -struct rs_control; struct persistent_ram_ecc_info { int block_size; @@ -39,84 +18,6 @@ struct persistent_ram_ecc_info { uint16_t *par; }; -/** - * struct persistent_ram_zone - Details of a persistent RAM zone (PRZ) - * used as a pstore backend - * - * @paddr: physical address of the mapped RAM area - * @size: size of mapping - * @label: unique name of this PRZ - * @type: frontend type for this PRZ - * @flags: holds PRZ_FLAGS_* bits - * - * @buffer_lock: - * locks access to @buffer "size" bytes and "start" offset - * @buffer: - * pointer to actual RAM area managed by this PRZ - * @buffer_size: - * bytes in @buffer->data (not including any trailing ECC bytes) - * - * @par_buffer: - * pointer into @buffer->data containing ECC bytes for @buffer->data - * @par_header: - * pointer into @buffer->data containing ECC bytes for @buffer header - * (i.e. all fields up to @data) - * @rs_decoder: - * RSLIB instance for doing ECC calculations - * @corrected_bytes: - * ECC corrected bytes accounting since boot - * @bad_blocks: - * ECC uncorrectable bytes accounting since boot - * @ecc_info: - * ECC configuration details - * - * @old_log: - * saved copy of @buffer->data prior to most recent wipe - * @old_log_size: - * bytes contained in @old_log - * - */ -struct persistent_ram_zone { - phys_addr_t paddr; - size_t size; - void *vaddr; - char *label; - enum pstore_type_id type; - u32 flags; - - raw_spinlock_t buffer_lock; - struct persistent_ram_buffer *buffer; - size_t buffer_size; - - char *par_buffer; - char *par_header; - struct rs_control *rs_decoder; - int corrected_bytes; - int bad_blocks; - struct persistent_ram_ecc_info ecc_info; - - char *old_log; - size_t old_log_size; -}; - -struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, - u32 sig, struct persistent_ram_ecc_info *ecc_info, - unsigned int memtype, u32 flags, char *label); -void persistent_ram_free(struct persistent_ram_zone *prz); -void persistent_ram_zap(struct persistent_ram_zone *prz); - -int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, - unsigned int count); -int persistent_ram_write_user(struct persistent_ram_zone *prz, - const void __user *s, unsigned int count); - -void persistent_ram_save_old(struct persistent_ram_zone *prz); -size_t persistent_ram_old_size(struct persistent_ram_zone *prz); -void *persistent_ram_old(struct persistent_ram_zone *prz); -void persistent_ram_free_old(struct persistent_ram_zone *prz); -ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, - char *str, size_t len); - /* * Ramoops platform data * @mem_size memory size for ramoops -- cgit From fddb1a6424787d8089a9032bd5d21c428670f854 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 7 Oct 2022 15:23:37 +0200 Subject: ata: add ata_port_is_frozen() helper At the request of the libata maintainer, introduce a ata_port_is_frozen() helper function. Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/libata.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index fe990176e6ee..af4953b95f76 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1043,6 +1043,11 @@ static inline int ata_port_is_dummy(struct ata_port *ap) return ap->ops == &ata_dummy_port_ops; } +static inline bool ata_port_is_frozen(const struct ata_port *ap) +{ + return ap->pflags & ATA_PFLAG_FROZEN; +} + extern int ata_std_prereset(struct ata_link *link, unsigned long deadline); extern int ata_wait_after_reset(struct ata_link *link, unsigned long deadline, int (*check_ready)(struct ata_link *link)); -- cgit From ed5a7047d2011cb6b2bf84ceb6680124cc6a7d95 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 17 Oct 2022 17:06:37 +0200 Subject: attr: use consistent sgid stripping checks Currently setgid stripping in file_remove_privs()'s should_remove_suid() helper is inconsistent with other parts of the vfs. Specifically, it only raises ATTR_KILL_SGID if the inode is S_ISGID and S_IXGRP but not if the inode isn't in the caller's groups and the caller isn't privileged over the inode although we require this already in setattr_prepare() and setattr_copy() and so all filesystem implement this requirement implicitly because they have to use setattr_{prepare,copy}() anyway. But the inconsistency shows up in setgid stripping bugs for overlayfs in xfstests (e.g., generic/673, generic/683, generic/685, generic/686, generic/687). For example, we test whether suid and setgid stripping works correctly when performing various write-like operations as an unprivileged user (fallocate, reflink, write, etc.): echo "Test 1 - qa_user, non-exec file $verb" setup_testfile chmod a+rws $junk_file commit_and_check "$qa_user" "$verb" 64k 64k The test basically creates a file with 6666 permissions. While the file has the S_ISUID and S_ISGID bits set it does not have the S_IXGRP set. On a regular filesystem like xfs what will happen is: sys_fallocate() -> vfs_fallocate() -> xfs_file_fallocate() -> file_modified() -> __file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = ATTR_FORCE | kill; -> notify_change() -> setattr_copy() In should_remove_suid() we can see that ATTR_KILL_SUID is raised unconditionally because the file in the test has S_ISUID set. But we also see that ATTR_KILL_SGID won't be set because while the file is S_ISGID it is not S_IXGRP (see above) which is a condition for ATTR_KILL_SGID being raised. So by the time we call notify_change() we have attr->ia_valid set to ATTR_KILL_SUID | ATTR_FORCE. Now notify_change() sees that ATTR_KILL_SUID is set and does: ia_valid = attr->ia_valid |= ATTR_MODE attr->ia_mode = (inode->i_mode & ~S_ISUID); which means that when we call setattr_copy() later we will definitely update inode->i_mode. Note that attr->ia_mode still contains S_ISGID. Now we call into the filesystem's ->setattr() inode operation which will end up calling setattr_copy(). Since ATTR_MODE is set we will hit: if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } and since the caller in the test is neither capable nor in the group of the inode the S_ISGID bit is stripped. But assume the file isn't suid then ATTR_KILL_SUID won't be raised which has the consequence that neither the setgid nor the suid bits are stripped even though it should be stripped because the inode isn't in the caller's groups and the caller isn't privileged over the inode. If overlayfs is in the mix things become a bit more complicated and the bug shows up more clearly. When e.g., ovl_setattr() is hit from ovl_fallocate()'s call to file_remove_privs() then ATTR_KILL_SUID and ATTR_KILL_SGID might be raised but because the check in notify_change() is questioning the ATTR_KILL_SGID flag again by requiring S_IXGRP for it to be stripped the S_ISGID bit isn't removed even though it should be stripped: sys_fallocate() -> vfs_fallocate() -> ovl_fallocate() -> file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = ATTR_FORCE | kill; -> notify_change() -> ovl_setattr() // TAKE ON MOUNTER'S CREDS -> ovl_do_notify_change() -> notify_change() // GIVE UP MOUNTER'S CREDS // TAKE ON MOUNTER'S CREDS -> vfs_fallocate() -> xfs_file_fallocate() -> file_modified() -> __file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = attr_force | kill; -> notify_change() The fix for all of this is to make file_remove_privs()'s should_remove_suid() helper to perform the same checks as we already require in setattr_prepare() and setattr_copy() and have notify_change() not pointlessly requiring S_IXGRP again. It doesn't make any sense in the first place because the caller must calculate the flags via should_remove_suid() anyway which would raise ATTR_KILL_SGID. While we're at it we move should_remove_suid() from inode.c to attr.c where it belongs with the rest of the iattr helpers. Especially since it returns ATTR_KILL_S{G,U}ID flags. We also rename it to setattr_should_drop_suidgid() to better reflect that it indicates both setuid and setgid bit removal and also that it returns attr flags. Running xfstests with this doesn't report any regressions. We should really try and use consistent checks. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e654435f1651..b39c5efca180 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3104,7 +3104,7 @@ extern void __destroy_inode(struct inode *); extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); -extern int should_remove_suid(struct dentry *); +extern int setattr_should_drop_suidgid(struct user_namespace *, struct inode *); extern int file_remove_privs(struct file *); /* -- cgit From e6c86c513f440bec5f1046539c7e3c6c653842da Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Oct 2022 19:39:43 +0800 Subject: rcu-tasks: Provide rcu_trace_implies_rcu_gp() As an accident of implementation, an RCU Tasks Trace grace period also acts as an RCU grace period. However, this could change at any time. This commit therefore creates an rcu_trace_implies_rcu_gp() that currently returns true to codify this accident. Code relying on this accident must call this function to verify that this accident is still happening. Reported-by: Hou Tao Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Martin KaFai Lau Link: https://lore.kernel.org/r/20221014113946.965131-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/rcupdate.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 08605ce7379d..8822f06e4b40 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -240,6 +240,18 @@ static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ +/** + * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? + * + * As an accident of implementation, an RCU Tasks Trace grace period also + * acts as an RCU grace period. However, this could change at any time. + * Code relying on this accident must call this function to verify that + * this accident is still happening. + * + * You have been warned! + */ +static inline bool rcu_trace_implies_rcu_gp(void) { return true; } + /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * -- cgit From 5d0f5953b60f5f7a278085b55ddc73e2932f4c33 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Sep 2022 12:09:30 -0700 Subject: srcu: Convert ->srcu_lock_count and ->srcu_unlock_count to atomic NMI-safe variants of srcu_read_lock() and srcu_read_unlock() are needed by printk(), which on many architectures entails read-modify-write atomic operations. This commit prepares Tree SRCU for this change by making both ->srcu_lock_count and ->srcu_unlock_count by atomic_long_t. [ paulmck: Apply feedback from John Ogness. ] Link: https://lore.kernel.org/all/20220910221947.171557773@linutronix.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: John Ogness Cc: Petr Mladek --- include/linux/srcutree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index e3014319d1ad..0c4eca07d78d 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -23,8 +23,8 @@ struct srcu_struct; */ struct srcu_data { /* Read-side state. */ - unsigned long srcu_lock_count[2]; /* Locks per CPU. */ - unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ + atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ + atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ /* Update-side state. */ spinlock_t __private lock ____cacheline_internodealigned_in_smp; -- cgit From b5ad0d2e8832c770b3ff2887ae37b47f42a3ab82 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Thu, 15 Sep 2022 16:38:24 +0800 Subject: rcu: Remove unused 'cpu' in rcu_virt_note_context_switch() This commit removes the unused function argument 'cpu'. This does not change functionality, but might save a cycle or two. Signed-off-by: Zeng Heng Acked-by: Mukesh Ojha Signed-off-by: Paul E. McKenney --- include/linux/kvm_host.h | 2 +- include/linux/rcutiny.h | 2 +- include/linux/rcutree.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 32f259fa5801..381b92d146c7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -416,7 +416,7 @@ static __always_inline void guest_context_enter_irqoff(void) */ if (!context_tracking_guest_enter()) { instrumentation_begin(); - rcu_virt_note_context_switch(smp_processor_id()); + rcu_virt_note_context_switch(); instrumentation_end(); } } diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 768196a5f39d..9bc025aa79a3 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -142,7 +142,7 @@ static inline int rcu_needs_cpu(void) * Take advantage of the fact that there is only one CPU, which * allows us to ignore virtualization-based context switches. */ -static inline void rcu_virt_note_context_switch(int cpu) { } +static inline void rcu_virt_note_context_switch(void) { } static inline void rcu_cpu_stall_reset(void) { } static inline int rcu_jiffies_till_stall_check(void) { return 21 * HZ; } static inline void rcu_irq_exit_check_preempt(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5efb51486e8a..70795386b9ff 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -27,7 +27,7 @@ void rcu_cpu_stall_reset(void); * wrapper around rcu_note_context_switch(), which allows TINY_RCU * to save a few bytes. The caller must have disabled interrupts. */ -static inline void rcu_virt_note_context_switch(int cpu) +static inline void rcu_virt_note_context_switch(void) { rcu_note_context_switch(false); } -- cgit From a2fd08448f2b7591fc7ec8dec2e5e025a43f0cee Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 17 Oct 2022 14:18:26 +0200 Subject: net: remove smc911x driver This driver was used on Arm and SH machines until 2009, when the last platforms moved to the smsc911x driver for the same hardware. Time to retire this version. Link: https://lore.kernel.org/netdev/1232010482-3744-1-git-send-email-steve.glendinning@smsc.com/ Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20221017121900.3520108-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/smc911x.h | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 include/linux/smc911x.h (limited to 'include/linux') diff --git a/include/linux/smc911x.h b/include/linux/smc911x.h deleted file mode 100644 index 8cace8189e74..000000000000 --- a/include/linux/smc911x.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __SMC911X_H__ -#define __SMC911X_H__ - -#define SMC911X_USE_16BIT (1 << 0) -#define SMC911X_USE_32BIT (1 << 1) - -struct smc911x_platdata { - unsigned long flags; - unsigned long irq_flags; /* IRQF_... */ - int irq_polarity; -}; - -#endif /* __SMC911X_H__ */ -- cgit From 3350607dc5637be2563f484dcfe2fed456f3d4ff Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Tue, 18 Oct 2022 20:22:06 +0200 Subject: security: Create file_truncate hook from path_truncate hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like path_truncate, the file_truncate hook also restricts file truncation, but is called in the cases where truncation is attempted on an already-opened file. This is required in a subsequent commit to handle ftruncate() operations differently to truncate() operations. Acked-by: Tetsuo Handa Acked-by: John Johansen Acked-by: Paul Moore Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20221018182216.301684-2-gnoack3000@gmail.com Signed-off-by: Mickaël Salaün --- include/linux/lsm_hook_defs.h | 1 + include/linux/lsm_hooks.h | 10 +++++++++- include/linux/security.h | 6 ++++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ec119da1d89b..f67025823d92 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -177,6 +177,7 @@ LSM_HOOK(int, 0, file_send_sigiotask, struct task_struct *tsk, struct fown_struct *fown, int sig) LSM_HOOK(int, 0, file_receive, struct file *file) LSM_HOOK(int, 0, file_open, struct file *file) +LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, unsigned long clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 4ec80b96c22e..fad93a6d5293 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -409,7 +409,9 @@ * @attr is the iattr structure containing the new file attributes. * Return 0 if permission is granted. * @path_truncate: - * Check permission before truncating a file. + * Check permission before truncating the file indicated by path. + * Note that truncation permissions may also be checked based on + * already opened files, using the @file_truncate hook. * @path contains the path structure for the file. * Return 0 if permission is granted. * @inode_getattr: @@ -598,6 +600,12 @@ * to receive an open file descriptor via socket IPC. * @file contains the file structure being received. * Return 0 if permission is granted. + * @file_truncate: + * Check permission before truncating a file, i.e. using ftruncate. + * Note that truncation permission may also be checked based on the path, + * using the @path_truncate hook. + * @file contains the file structure for the file. + * Return 0 if permission is granted. * @file_open: * Save open-time permission checking state for later use upon * file_permission, and recheck access if anything has changed diff --git a/include/linux/security.h b/include/linux/security.h index ca1b7109c0db..1b5de26bdb12 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -396,6 +396,7 @@ int security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig); int security_file_receive(struct file *file); int security_file_open(struct file *file); +int security_file_truncate(struct file *file); int security_task_alloc(struct task_struct *task, unsigned long clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); @@ -1014,6 +1015,11 @@ static inline int security_file_open(struct file *file) return 0; } +static inline int security_file_truncate(struct file *file) +{ + return 0; +} + static inline int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { -- cgit From 653714220f88717f239d811222c2dd0e64b181ba Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 6 Jun 2022 16:37:57 -0400 Subject: signal/compat: Remove compat_sigset_t override x86 no longer uses compat_sigset_t when CONFIG_COMPAT isn't enabled, so remove the override define. Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Acked-by: "Eric W. Biederman" Link: https://lore.kernel.org/r/20220606203802.158958-4-brgerst@gmail.com Signed-off-by: Borislav Petkov --- include/linux/compat.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 594357881b0b..44b1736c95b5 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -126,11 +126,9 @@ struct compat_tms { #define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW) -#ifndef compat_sigset_t typedef struct { compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; -#endif int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize); -- cgit From 24e6dc35ccd825de7c71751610ff8f3295347e5b Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 6 Jun 2022 16:38:01 -0400 Subject: x86/signal/32: Merge native and compat 32-bit signal code There are significant differences between signal handling on 32-bit vs. 64-bit, like different structure layouts and legacy syscalls. Instead of duplicating that code for native and compat, merge both versions into one file. Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Acked-by: "Eric W. Biederman" Link: https://lore.kernel.org/r/20220606203802.158958-8-brgerst@gmail.com Signed-off-by: Borislav Petkov --- include/linux/syscalls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a34b0f9a9972..33a0ee3bcb2e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -264,6 +264,7 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) #define SC_VAL64(type, name) ((type) name##_hi << 32 | name##_lo) #ifdef CONFIG_COMPAT +#define SYSCALL32_DEFINE0 COMPAT_SYSCALL_DEFINE0 #define SYSCALL32_DEFINE1 COMPAT_SYSCALL_DEFINE1 #define SYSCALL32_DEFINE2 COMPAT_SYSCALL_DEFINE2 #define SYSCALL32_DEFINE3 COMPAT_SYSCALL_DEFINE3 @@ -271,6 +272,7 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) #define SYSCALL32_DEFINE5 COMPAT_SYSCALL_DEFINE5 #define SYSCALL32_DEFINE6 COMPAT_SYSCALL_DEFINE6 #else +#define SYSCALL32_DEFINE0 SYSCALL_DEFINE0 #define SYSCALL32_DEFINE1 SYSCALL_DEFINE1 #define SYSCALL32_DEFINE2 SYSCALL_DEFINE2 #define SYSCALL32_DEFINE3 SYSCALL_DEFINE3 -- cgit From 138060ba92b3b0d77c8e6818d0f33398b23ea42e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Sep 2022 10:29:39 +0200 Subject: fs: pass dentry to set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. Since some filesystem rely on the dentry being available to them when setting posix acls (e.g., 9p and cifs) they cannot rely on set acl inode operation. But since ->set_acl() is required in order to use the generic posix acl xattr handlers filesystems that do not implement this inode operation cannot use the handler and need to implement their own dedicated posix acl handlers. Update the ->set_acl() inode method to take a dentry argument. This allows all filesystems to rely on ->set_acl(). As far as I can tell all codepaths can be switched to rely on the dentry instead of just the inode. Note that the original motivation for passing the dentry separate from the inode instead of just the dentry in the xattr handlers was because of security modules that call security_d_instantiate(). This hook is called during d_instantiate_new(), d_add(), __d_instantiate_anon(), and d_splice_alias() to initialize the inode's security context and possibly to set security.* xattrs. Since this only affects security.* xattrs this is completely irrelevant for posix acls. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- include/linux/fs.h | 2 +- include/linux/posix_acl.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e654435f1651..3db0b23c6a55 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2172,7 +2172,7 @@ struct inode_operations { umode_t create_mode); int (*tmpfile) (struct user_namespace *, struct inode *, struct file *, umode_t); - int (*set_acl)(struct user_namespace *, struct inode *, + int (*set_acl)(struct user_namespace *, struct dentry *, struct posix_acl *, int); int (*fileattr_set)(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 7d1e604c1325..cd16a756cd1e 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -69,21 +69,21 @@ extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); -extern int set_posix_acl(struct user_namespace *, struct inode *, int, - struct posix_acl *); +int set_posix_acl(struct user_namespace *, struct dentry *, int, + struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL -int posix_acl_chmod(struct user_namespace *, struct inode *, umode_t); +int posix_acl_chmod(struct user_namespace *, struct dentry *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); int posix_acl_update_mode(struct user_namespace *, struct inode *, umode_t *, struct posix_acl **); -extern int simple_set_acl(struct user_namespace *, struct inode *, - struct posix_acl *, int); +int simple_set_acl(struct user_namespace *, struct dentry *, + struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); @@ -101,7 +101,7 @@ static inline void cache_no_acl(struct inode *inode) } #else static inline int posix_acl_chmod(struct user_namespace *mnt_userns, - struct inode *inode, umode_t mode) + struct dentry *dentry, umode_t mode) { return 0; } -- cgit From f392a1846489720fc2e063d1210633b6cf4ec5a4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 17 Oct 2022 16:22:35 -0400 Subject: net: phylink: provide phylink_validate_mask_caps() helper Provide a helper that restricts the link modes according to the phylink capabilities. Signed-off-by: Russell King (Oracle) [rebased on net-next/master and added documentation] Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 664dd409feb9..c29c3f174972 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -556,6 +556,9 @@ void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); unsigned long phylink_get_capabilities(phy_interface_t interface, unsigned long mac_capabilities, int rate_matching); +void phylink_validate_mask_caps(unsigned long *supported, + struct phylink_link_state *state, + unsigned long caps); void phylink_generic_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit From 1e400cb9cff2157f89ca95aba4589f95253425ba Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Tue, 11 Oct 2022 11:50:37 +0530 Subject: firmware: xilinx: Add qspi firmware interface Add support for QSPI ioctl functions and enums. Signed-off-by: Rajan Vaja Signed-off-by: Michal Simek Signed-off-by: Amit Kumar Mahapatra Link: https://lore.kernel.org/r/20221011062040.12116-5-amit.kumar-mahapatra@amd.com Signed-off-by: Mark Brown --- include/linux/firmware/xlnx-zynqmp.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 76d2b3ebad84..fac37680ffe7 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -135,6 +135,7 @@ enum pm_ret_status { }; enum pm_ioctl_id { + IOCTL_SET_TAPDELAY_BYPASS = 4, IOCTL_SD_DLL_RESET = 6, IOCTL_SET_SD_TAPDELAY = 7, IOCTL_SET_PLL_FRAC_MODE = 8, @@ -389,6 +390,18 @@ enum zynqmp_pm_shutdown_subtype { ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM = 2, }; +enum tap_delay_signal_type { + PM_TAPDELAY_NAND_DQS_IN = 0, + PM_TAPDELAY_NAND_DQS_OUT = 1, + PM_TAPDELAY_QSPI = 2, + PM_TAPDELAY_MAX = 3, +}; + +enum tap_delay_bypass_ctrl { + PM_TAPDELAY_BYPASS_DISABLE = 0, + PM_TAPDELAY_BYPASS_ENABLE = 1, +}; + enum ospi_mux_select_type { PM_OSPI_MUX_SEL_DMA = 0, PM_OSPI_MUX_SEL_LINEAR = 1, @@ -484,6 +497,7 @@ int zynqmp_pm_write_ggs(u32 index, u32 value); int zynqmp_pm_read_ggs(u32 index, u32 *value); int zynqmp_pm_write_pggs(u32 index, u32 value); int zynqmp_pm_read_pggs(u32 index, u32 *value); +int zynqmp_pm_set_tapdelay_bypass(u32 index, u32 value); int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype); int zynqmp_pm_set_boot_health_status(u32 value); int zynqmp_pm_pinctrl_request(const u32 pin); @@ -696,6 +710,11 @@ static inline int zynqmp_pm_read_pggs(u32 index, u32 *value) return -ENODEV; } +static inline int zynqmp_pm_set_tapdelay_bypass(u32 index, u32 value) +{ + return -ENODEV; +} + static inline int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype) { return -ENODEV; -- cgit From 9ceb338ab1769fd04c7d347f086c3b5ee01128e1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 7 Oct 2022 16:44:44 +0300 Subject: gpio: aspeed: Add missing header(s) Do not imply that some of the generic headers may be always included. Instead, include explicitly what we are direct user of. While at it, sort headers alphabetically. Signed-off-by: Andy Shevchenko --- include/linux/gpio/aspeed.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/aspeed.h b/include/linux/gpio/aspeed.h index 1bfb3cdc86d0..9a547e66c8c4 100644 --- a/include/linux/gpio/aspeed.h +++ b/include/linux/gpio/aspeed.h @@ -1,6 +1,10 @@ #ifndef __GPIO_ASPEED_H #define __GPIO_ASPEED_H +#include + +struct gpio_desc; + struct aspeed_gpio_copro_ops { int (*request_access)(void *data); int (*release_access)(void *data); -- cgit From 51c352bdbcd23d7ce46b06c1e64c82754dc44044 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 18 Oct 2022 15:37:27 +0100 Subject: netlink: add support for formatted extack messages Include an 80-byte buffer in struct netlink_ext_ack that can be used for scnprintf()ed messages. This does mean that the resulting string can't be enumerated, translated etc. in the way NL_SET_ERR_MSG() was designed to allow. Signed-off-by: Edward Cree Reviewed-by: Jakub Kicinski Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index d51e041d2242..d81bde5a5844 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -64,6 +64,7 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 20 +#define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct @@ -75,6 +76,8 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length + * @_msg_buf: output buffer for formatted message strings - don't access + * directly, use %NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; @@ -84,13 +87,13 @@ struct netlink_ext_ack { u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; + char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. - * Currently string formatting is not supported (due - * to the lack of an output buffer.) + * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ @@ -102,9 +105,31 @@ struct netlink_ext_ack { __extack->_msg = __msg; \ } while (0) +/* We splice fmt with %s at each end even in the snprintf so that both calls + * can use the same string constant, avoiding its duplication in .ro + */ +#define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (!__extack) \ + break; \ + if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ + "%s" fmt "%s", "", ##args, "") >= \ + NETLINK_MAX_FMTMSG_LEN) \ + net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ + ##args, "\n"); \ + \ + do_trace_netlink_extack(__extack->_msg_buf); \ + \ + __extack->_msg = __extack->_msg_buf; \ +} while (0) + #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) +#define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ + NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) + #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ -- cgit From cac2f8b8d8b50ef32b3e34f6dcbbf08937e4f616 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:00 +0200 Subject: fs: rename current get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. The current inode operation for getting posix acls takes an inode argument but various filesystems (e.g., 9p, cifs, overlayfs) need access to the dentry. In contrast to the ->set_acl() inode operation we cannot simply extend ->get_acl() to take a dentry argument. The ->get_acl() inode operation is called from: acl_permission_check() -> check_acl() -> get_acl() which is part of generic_permission() which in turn is part of inode_permission(). Both generic_permission() and inode_permission() are called in the ->permission() handler of various filesystems (e.g., overlayfs). So simply passing a dentry argument to ->get_acl() would amount to also having to pass a dentry argument to ->permission(). We should avoid this unnecessary change. So instead of extending the existing inode operation rename it from ->get_acl() to ->get_inode_acl() and add a ->get_acl() method later that passes a dentry argument and which filesystems that need access to the dentry can implement instead of ->get_inode_acl(). Filesystems like cifs which allow setting and getting posix acls but not using them for permission checking during lookup can simply not implement ->get_inode_acl(). This is intended to be a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Suggested-by/Inspired-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- include/linux/fs.h | 6 +++--- include/linux/posix_acl.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3db0b23c6a55..2395e1388e2e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -560,8 +560,8 @@ struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) /* * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to - * cache the ACL. This also means that ->get_acl() can be called in RCU mode - * with the LOOKUP_RCU flag. + * cache the ACL. This also means that ->get_inode_acl() can be called in RCU + * mode with the LOOKUP_RCU flag. */ #define ACL_DONT_CACHE ((void *)(-3)) @@ -2142,7 +2142,7 @@ struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct user_namespace *, struct inode *, int); - struct posix_acl * (*get_acl)(struct inode *, int, bool); + struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index cd16a756cd1e..07e171b4428a 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -128,6 +128,6 @@ static inline void forget_all_cached_acls(struct inode *inode) } #endif /* CONFIG_FS_POSIX_ACL */ -struct posix_acl *get_acl(struct inode *inode, int type); +struct posix_acl *get_inode_acl(struct inode *inode, int type); #endif /* __LINUX_POSIX_ACL_H */ -- cgit From 7420332a6ff407ba2d3d25f5e8430bf426131d1d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:01 +0200 Subject: fs: add new get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. Since some filesystem rely on the dentry being available to them when setting posix acls (e.g., 9p and cifs) they cannot rely on the old get acl inode operation to retrieve posix acl and need to implement their own custom handlers because of that. In a previous patch we renamed the old get acl inode operation to ->get_inode_acl(). We decided to rename it and implement a new one since ->get_inode_acl() is called generic_permission() and inode_permission() both of which can be called during an filesystem's ->permission() handler. So simply passing a dentry argument to ->get_acl() would have amounted to also having to pass a dentry argument to ->permission(). We avoided that change. This adds a new ->get_acl() inode operations which takes a dentry argument which filesystems such as 9p, cifs, and overlayfs can implement to get posix acls. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2395e1388e2e..255f6eff89d7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2172,6 +2172,8 @@ struct inode_operations { umode_t create_mode); int (*tmpfile) (struct user_namespace *, struct inode *, struct file *, umode_t); + struct posix_acl *(*get_acl)(struct user_namespace *, struct dentry *, + int); int (*set_acl)(struct user_namespace *, struct dentry *, struct posix_acl *, int); int (*fileattr_set)(struct user_namespace *mnt_userns, -- cgit From 6cd4d4e8b6e1495eb0cafa3a59d1fde137a98d22 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:04 +0200 Subject: 9p: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystem to implement get and set acl. So far 9p implemented a ->get_inode_acl() operation that didn't require access to the dentry in order to allow (limited) permission checking via posix acls in the vfs. Now that we have get and set acl inode operations that take a dentry argument we can give 9p get and set acl inode operations. This is mostly a refactoring of the codepaths currently used in 9p posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the 9p specific posix acl xattr handler and associated code will be removed. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- include/linux/posix_acl_xattr.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 8163dd48c430..ebfa11ac7046 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -60,6 +60,17 @@ int posix_acl_to_xattr(struct user_namespace *user_ns, struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns, struct user_namespace *fs_userns, const void *value, size_t size); +static inline const char *posix_acl_xattr_name(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return XATTR_NAME_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return XATTR_NAME_POSIX_ACL_DEFAULT; + } + + return ""; +} extern const struct xattr_handler posix_acl_access_xattr_handler; extern const struct xattr_handler posix_acl_default_xattr_handler; -- cgit From 72b3897e78107c54e3e5a98bdb316dafcd818f97 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:07 +0200 Subject: security: add get, remove and set acl hook The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. So far posix acls were passed as a void blob to the security and integrity modules. Some of them like evm then proceed to interpret the void pointer and convert it into the kernel internal struct posix acl representation to perform their integrity checking magic. This is obviously pretty problematic as that requires knowledge that only the vfs is guaranteed to have and has lead to various bugs. Add a proper security hook for setting posix acls and pass down the posix acls in their appropriate vfs format instead of hacking it through a void pointer stored in the uapi format. In the next patches we implement the hooks for the few security modules that do actually have restrictions on posix acls. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Acked-by: Paul Moore Signed-off-by: Christian Brauner (Microsoft) --- include/linux/lsm_hook_defs.h | 6 ++++++ include/linux/lsm_hooks.h | 12 ++++++++++++ include/linux/security.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ec119da1d89b..7f4aaddce298 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -145,6 +145,12 @@ LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry) LSM_HOOK(int, 0, inode_removexattr, struct user_namespace *mnt_userns, struct dentry *dentry, const char *name) +LSM_HOOK(int, 0, inode_set_acl, struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) +LSM_HOOK(int, 0, inode_get_acl, struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name) +LSM_HOOK(int, 0, inode_remove_acl, struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name) LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry) LSM_HOOK(int, 0, inode_killpriv, struct user_namespace *mnt_userns, struct dentry *dentry) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 4ec80b96c22e..1d02d1170e21 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -435,6 +435,18 @@ * Check permission before removing the extended attribute * identified by @name for @dentry. * Return 0 if permission is granted. + * @inode_set_acl: + * Check permission before setting posix acls + * The posix acls in @kacl are identified by @acl_name. + * Return 0 if permission is granted. + * @inode_get_acl: + * Check permission before getting osix acls + * The posix acls are identified by @acl_name. + * Return 0 if permission is granted. + * @inode_remove_acl: + * Check permission before removing posix acls + * The posix acls are identified by @acl_name. + * Return 0 if permission is granted. * @inode_getsecurity: * Retrieve a copy of the extended attribute representation of the * security label associated with @name for @inode via @buffer. Note that diff --git a/include/linux/security.h b/include/linux/security.h index ca1b7109c0db..2bfc2e1ce51f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -361,6 +361,13 @@ int security_inode_getattr(const struct path *path); int security_inode_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, const void *value, size_t size, int flags); +int security_inode_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name, + struct posix_acl *kacl); +int security_inode_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name); +int security_inode_remove_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name); void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int security_inode_getxattr(struct dentry *dentry, const char *name); @@ -872,6 +879,28 @@ static inline int security_inode_setxattr(struct user_namespace *mnt_userns, return cap_inode_setxattr(dentry, name, value, size, flags); } +static inline int security_inode_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, + const char *acl_name, + struct posix_acl *kacl) +{ + return 0; +} + +static inline int security_inode_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, + const char *acl_name) +{ + return 0; +} + +static inline int security_inode_remove_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, + const char *acl_name) +{ + return 0; +} + static inline void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { } -- cgit From e61b135f7bfe47f547fb566328a97ca8baa3548c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:10 +0200 Subject: integrity: implement get and set acl hook The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. So far posix acls were passed as a void blob to the security and integrity modules. Some of them like evm then proceed to interpret the void pointer and convert it into the kernel internal struct posix acl representation to perform their integrity checking magic. This is obviously pretty problematic as that requires knowledge that only the vfs is guaranteed to have and has lead to various bugs. Add a proper security hook for setting posix acls and pass down the posix acls in their appropriate vfs format instead of hacking it through a void pointer stored in the uapi format. I spent considerate time in the security module and integrity infrastructure and audited all codepaths. EVM is the only part that really has restrictions based on the actual posix acl values passed through it (e.g., i_mode). Before this dedicated hook EVM used to translate from the uapi posix acl format sent to it in the form of a void pointer into the vfs format. This is not a good thing. Instead of hacking around in the uapi struct give EVM the posix acls in the appropriate vfs format and perform sane permissions checks that mirror what it used to to in the generic xattr hook. IMA doesn't have any restrictions on posix acls. When posix acls are changed it just wants to update its appraisal status to trigger an EVM revalidation. The removal of posix acls is equivalent to passing NULL to the posix set acl hooks. This is the same as before through the generic xattr api. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Acked-by: Paul Moore (LSM) Signed-off-by: Christian Brauner (Microsoft) --- include/linux/evm.h | 23 +++++++++++++++++++++++ include/linux/ima.h | 24 ++++++++++++++++++++++++ 2 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index aa63e0b3c0a2..86139be48992 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -35,6 +35,15 @@ extern int evm_inode_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *xattr_name); extern void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name); +extern int evm_inode_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name, + struct posix_acl *kacl); +static inline int evm_inode_remove_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, + const char *acl_name) +{ + return evm_inode_set_acl(mnt_userns, dentry, acl_name, NULL); +} extern int evm_inode_init_security(struct inode *inode, const struct xattr *xattr_array, struct xattr *evm); @@ -108,6 +117,20 @@ static inline void evm_inode_post_removexattr(struct dentry *dentry, return; } +static inline int evm_inode_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name, + struct posix_acl *kacl) +{ + return 0; +} + +static inline int evm_inode_remove_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, + const char *acl_name) +{ + return 0; +} + static inline int evm_inode_init_security(struct inode *inode, const struct xattr *xattr_array, struct xattr *evm) diff --git a/include/linux/ima.h b/include/linux/ima.h index 81708ca0ebc7..5a0b2a285a18 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -187,6 +187,15 @@ extern void ima_inode_post_setattr(struct user_namespace *mnt_userns, struct dentry *dentry); extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len); +extern int ima_inode_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name, + struct posix_acl *kacl); +static inline int ima_inode_remove_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, + const char *acl_name) +{ + return ima_inode_set_acl(mnt_userns, dentry, acl_name, NULL); +} extern int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name); #else static inline bool is_ima_appraise_enabled(void) @@ -208,11 +217,26 @@ static inline int ima_inode_setxattr(struct dentry *dentry, return 0; } +static inline int ima_inode_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name, + struct posix_acl *kacl) +{ + + return 0; +} + static inline int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name) { return 0; } + +static inline int ima_inode_remove_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, + const char *acl_name) +{ + return 0; +} #endif /* CONFIG_IMA_APPRAISE */ #if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING) -- cgit From a56df5d5b7ca6d79c3cdef32401380e60c0928b1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Sep 2022 13:34:00 +0200 Subject: evm: add post set acl hook The security_inode_post_setxattr() hook is used by security modules to update their own security.* xattrs. Consequently none of the security modules operate on posix acls. So we don't need an additional security hook when post setting posix acls. However, the integrity subsystem wants to be informed about posix acl changes in order to reset the EVM status flag. -> evm_inode_post_setxattr() -> evm_update_evmxattr() -> evm_calc_hmac() -> evm_calc_hmac_or_hash() and evm_cacl_hmac_or_hash() walks the global list of protected xattr names evm_config_xattrnames. This global list can be modified via /sys/security/integrity/evm/evm_xattrs. The write to "evm_xattrs" is restricted to security.* xattrs and the default xattrs in evm_config_xattrnames only contains security.* xattrs as well. So the actual value for posix acls is currently completely irrelevant for evm during evm_inode_post_setxattr() and frankly it should stay that way in the future to not cause the vfs any more headaches. But if the actual posix acl values matter then evm shouldn't operate on the binary void blob and try to hack around in the uapi struct anyway. Instead it should then in the future add a dedicated hook which takes a struct posix_acl argument passing the posix acls in the proper vfs format. For now it is sufficient to make evm_inode_post_set_acl() a wrapper around evm_inode_post_setxattr() not passing any actual values down. This will cause the hashes to be updated as before. Reviewed-by: Paul Moore Signed-off-by: Christian Brauner (Microsoft) --- include/linux/evm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index 86139be48992..117ac01b2432 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -44,6 +44,12 @@ static inline int evm_inode_remove_acl(struct user_namespace *mnt_userns, { return evm_inode_set_acl(mnt_userns, dentry, acl_name, NULL); } +static inline void evm_inode_post_set_acl(struct dentry *dentry, + const char *acl_name, + struct posix_acl *kacl) +{ + return evm_inode_post_setxattr(dentry, acl_name, NULL, 0); +} extern int evm_inode_init_security(struct inode *inode, const struct xattr *xattr_array, struct xattr *evm); @@ -131,6 +137,13 @@ static inline int evm_inode_remove_acl(struct user_namespace *mnt_userns, return 0; } +static inline void evm_inode_post_set_acl(struct dentry *dentry, + const char *acl_name, + struct posix_acl *kacl) +{ + return; +} + static inline int evm_inode_init_security(struct inode *inode, const struct xattr *xattr_array, struct xattr *evm) -- cgit From e4cc9163032fed6ff27dd03325ddc54f88863a24 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:06 +0200 Subject: acl: add vfs_set_acl() In previous patches we implemented get and set inode operations for all non-stacking filesystems that support posix acls but didn't yet implement get and/or set acl inode operations. This specifically affected cifs and 9p. Now we can build a posix acl api based solely on get and set inode operations. We add a new vfs_set_acl() api that can be used to set posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. After we finished building the vfs api we can switch stacking filesystems to rely on the new posix api and then finally switch the xattr system calls themselves to rely on the posix acl api. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- include/linux/posix_acl.h | 10 ++++++++++ include/linux/posix_acl_xattr.h | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 07e171b4428a..316b05c1dc97 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -99,6 +99,9 @@ static inline void cache_no_acl(struct inode *inode) inode->i_acl = NULL; inode->i_default_acl = NULL; } + +int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, struct posix_acl *kacl); #else static inline int posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode) @@ -126,6 +129,13 @@ static inline int posix_acl_create(struct inode *inode, umode_t *mode, static inline void forget_all_cached_acls(struct inode *inode) { } + +static inline int vfs_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *name, + struct posix_acl *acl) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_inode_acl(struct inode *inode, int type); diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index ebfa11ac7046..b86b7f170d43 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -72,6 +72,16 @@ static inline const char *posix_acl_xattr_name(int type) return ""; } +static inline int posix_acl_type(const char *name) +{ + if (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) + return ACL_TYPE_ACCESS; + else if (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0) + return ACL_TYPE_DEFAULT; + + return -1; +} + extern const struct xattr_handler posix_acl_access_xattr_handler; extern const struct xattr_handler posix_acl_default_xattr_handler; -- cgit From 4f353ba4a9f42ad283dc6afdd84dae0b1d294842 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:13 +0200 Subject: acl: add vfs_get_acl() In previous patches we implemented get and set inode operations for all non-stacking filesystems that support posix acls but didn't yet implement get and/or set acl inode operations. This specifically affected cifs and 9p. Now we can build a posix acl api based solely on get and set inode operations. We add a new vfs_get_acl() api that can be used to get posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. After we finished building the vfs api we can switch stacking filesystems to rely on the new posix api and then finally switch the xattr system calls themselves to rely on the posix acl api. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- include/linux/posix_acl.h | 9 +++++++++ include/linux/posix_acl_xattr.h | 10 ++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 316b05c1dc97..86aec1efc80e 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -102,6 +102,8 @@ static inline void cache_no_acl(struct inode *inode) int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); +struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name); #else static inline int posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode) @@ -136,6 +138,13 @@ static inline int vfs_set_acl(struct user_namespace *mnt_userns, { return -EOPNOTSUPP; } + +static inline struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, + const char *acl_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_inode_acl(struct inode *inode, int type); diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index b86b7f170d43..bf30296389d7 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -38,6 +38,9 @@ void posix_acl_fix_xattr_to_user(void *value, size_t size); void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, const struct inode *inode, void *value, size_t size); +ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, + struct inode *inode, const struct posix_acl *acl, + void *buffer, size_t size); #else static inline void posix_acl_fix_xattr_from_user(void *value, size_t size) { @@ -51,6 +54,13 @@ posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, size_t size) { } +static inline ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, + struct inode *inode, + const struct posix_acl *acl, + void *buffer, size_t size) +{ + return 0; +} #endif struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, -- cgit From aeb7f00542af48ac63e448de46d672cfd79a7069 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:14 +0200 Subject: acl: add vfs_remove_acl() In previous patches we implemented get and set inode operations for all non-stacking filesystems that support posix acls but didn't yet implement get and/or set acl inode operations. This specifically affected cifs and 9p. Now we can build a posix acl api based solely on get and set inode operations. We add a new vfs_remove_acl() api that can be used to set posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. After we finished building the vfs api we can switch stacking filesystems to rely on the new posix api and then finally switch the xattr system calls themselves to rely on the posix acl api. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- include/linux/evm.h | 13 +++++++++++++ include/linux/posix_acl.h | 8 ++++++++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index 117ac01b2432..7a9ee2157f69 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -35,6 +35,12 @@ extern int evm_inode_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *xattr_name); extern void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name); +static inline void evm_inode_post_remove_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, + const char *acl_name) +{ + evm_inode_post_removexattr(dentry, acl_name); +} extern int evm_inode_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); @@ -123,6 +129,13 @@ static inline void evm_inode_post_removexattr(struct dentry *dentry, return; } +static inline void evm_inode_post_remove_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, + const char *acl_name) +{ + return; +} + static inline int evm_inode_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 86aec1efc80e..ee608d22ecb9 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -104,6 +104,8 @@ int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, const char *acl_name); +int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name); #else static inline int posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode) @@ -145,6 +147,12 @@ static inline struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, { return ERR_PTR(-EOPNOTSUPP); } + +static inline int vfs_remove_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_inode_acl(struct inode *inode, int type); -- cgit From 31acceb97500dd6e9105526301d76488cd6ca21c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:21 +0200 Subject: ovl: use posix acl api Now that posix acls have a proper api us it to copy them. All filesystems that can serve as lower or upper layers for overlayfs have gained support for the new posix acl api in previous patches. So switch all internal overlayfs codepaths for copying posix acls to the new posix acl api. Acked-by: Miklos Szeredi Signed-off-by: Christian Brauner (Microsoft) --- include/linux/xattr.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 4c379d23ec6e..c5238744bab9 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -22,6 +22,12 @@ struct inode; struct dentry; +static inline bool is_posix_acl_xattr(const char *name) +{ + return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); +} + /* * struct xattr_handler: When @name is set, match attributes with exactly that * name. When @prefix is set instead, match attributes with that prefix and -- cgit From 318e66856ddec05384f32d60b5598128289f4e7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:22 +0200 Subject: xattr: use posix acl api In previous patches we built a new posix api solely around get and set inode operations. Now that we have all the pieces in place we can switch the system calls and the vfs over to only rely on this api when interacting with posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. With the new posix acl api we immediately translate into an appropriate kernel internal struct posix_acl format both when getting and setting posix acls. This is a stark contrast to before were we hacked unsafe raw values into the uapi struct that was stored in a void pointer relying and having filesystems and security modules hack around in the uapi struct as well. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- include/linux/posix_acl_xattr.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index bf30296389d7..c5d5fbc348dc 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -33,6 +33,8 @@ posix_acl_xattr_count(size_t size) } #ifdef CONFIG_FS_POSIX_ACL +struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, + const void *value, size_t size); void posix_acl_fix_xattr_from_user(void *value, size_t size); void posix_acl_fix_xattr_to_user(void *value, size_t size); void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, @@ -42,6 +44,12 @@ ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, struct inode *inode, const struct posix_acl *acl, void *buffer, size_t size); #else +static inline struct posix_acl * +posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, + size_t size) +{ + return ERR_PTR(-EOPNOTSUPP); +} static inline void posix_acl_fix_xattr_from_user(void *value, size_t size) { } @@ -63,8 +71,6 @@ static inline ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, } #endif -struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, - const void *value, size_t size); int posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size); struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns, -- cgit From 0a26bde2c9db9817e2b4c0f890236f78d4d8ed7c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:27 +0200 Subject: acl: remove a slew of now unused helpers Now that the posix acl api is active we can remove all the hacky helpers we had to keep around for all these years and also remove the set and get posix acl xattr handler methods as they aren't needed anymore. Signed-off-by: Christian Brauner (Microsoft) --- include/linux/posix_acl_xattr.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index c5d5fbc348dc..8daac3c5b583 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -35,11 +35,6 @@ posix_acl_xattr_count(size_t size) #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, size_t size); -void posix_acl_fix_xattr_from_user(void *value, size_t size); -void posix_acl_fix_xattr_to_user(void *value, size_t size); -void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, - void *value, size_t size); ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, struct inode *inode, const struct posix_acl *acl, void *buffer, size_t size); @@ -50,18 +45,6 @@ posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, { return ERR_PTR(-EOPNOTSUPP); } -static inline void posix_acl_fix_xattr_from_user(void *value, size_t size) -{ -} -static inline void posix_acl_fix_xattr_to_user(void *value, size_t size) -{ -} -static inline void -posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, void *value, - size_t size) -{ -} static inline ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, struct inode *inode, const struct posix_acl *acl, @@ -73,9 +56,6 @@ static inline ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, int posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size); -struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const void *value, size_t size); static inline const char *posix_acl_xattr_name(int type) { switch (type) { -- cgit From 7ebe49b76a001b10b007193b1771f33e6cbc4f3f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 16 Oct 2022 12:41:26 +0200 Subject: driver core: allow kobj_to_dev() to take a const pointer If a const * to a kobject is passed to kobj_to_dev(), we want to return back a const * to a device as the driver core shouldn't be modifying a constant structure. But when dealing with container_of() the pointer const attribute is cast away, so we need to manually handle this by determining the type of the pointer passed in to know the type of the pointer to pass out. Luckily _Generic can do this type of magic, and as the kernel now supports C11 it is availble to us to handle this type of build-time type detection. Cc: "Rafael J. Wysocki" Reviewed-by: Sakari Ailus Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20221016104126.1259809-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 424b55df0272..023ea50b1916 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -680,11 +680,27 @@ struct device_link { bool supplier_preactivated; /* Owned by consumer probe. */ }; -static inline struct device *kobj_to_dev(struct kobject *kobj) +static inline struct device *__kobj_to_dev(struct kobject *kobj) { return container_of(kobj, struct device, kobj); } +static inline const struct device *__kobj_to_dev_const(const struct kobject *kobj) +{ + return container_of(kobj, const struct device, kobj); +} + +/* + * container_of() will happily take a const * and spit back a non-const * as it + * is just doing pointer math. But we want to be a bit more careful in the + * driver code, so manually force any const * of a kobject to also be a const * + * to a device. + */ +#define kobj_to_dev(kobj) \ + _Generic((kobj), \ + const struct kobject *: __kobj_to_dev_const, \ + struct kobject *: __kobj_to_dev)(kobj) + /** * device_iommu_mapped - Returns true when the device DMA is translated * by an IOMMU -- cgit From 593efa4091f5f05c224f8b7fd204d18dbff97e31 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 16 Oct 2022 12:41:55 +0200 Subject: USB: allow some usb functions to take a const pointer. The functions to_usb_interface(), to_usb_device, and interface_to_usbdev() sometimes would like to take a const * and return a const * back. As we are doing pointer math, a call to container_of() loses the const-ness of a pointer, so use a _Generic() macro to pick the proper inline function to call instead. Link: https://lore.kernel.org/r/20221016104155.1260201-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 9ff1ad4dfad1..3a55131e0ad4 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -258,7 +258,27 @@ struct usb_interface { struct device *usb_dev; struct work_struct reset_ws; /* for resets in atomic context */ }; -#define to_usb_interface(d) container_of(d, struct usb_interface, dev) + +static inline struct usb_interface *__to_usb_interface(struct device *d) +{ + return container_of(d, struct usb_interface, dev); +} + +static inline const struct usb_interface *__to_usb_interface_const(const struct device *d) +{ + return container_of(d, struct usb_interface, dev); +} + +/* + * container_of() will happily take a const * and spit back a non-const * as it + * is just doing pointer math. But we want to be a bit more careful in the USB + * driver code, so manually force any const * of a device to also be a const * + * to a usb_device. + */ +#define to_usb_interface(dev) \ + _Generic((dev), \ + const struct device *: __to_usb_interface_const, \ + struct device *: __to_usb_interface)(dev) static inline void *usb_get_intfdata(struct usb_interface *intf) { @@ -709,12 +729,41 @@ struct usb_device { u16 hub_delay; unsigned use_generic_driver:1; }; -#define to_usb_device(d) container_of(d, struct usb_device, dev) -static inline struct usb_device *interface_to_usbdev(struct usb_interface *intf) +static inline struct usb_device *__to_usb_device(struct device *d) +{ + return container_of(d, struct usb_device, dev); +} + +static inline const struct usb_device *__to_usb_device_const(const struct device *d) +{ + return container_of(d, struct usb_device, dev); +} + +/* + * container_of() will happily take a const * and spit back a non-const * as it + * is just doing pointer math. But we want to be a bit more careful in the USB + * driver code, so manually force any const * of a device to also be a const * + * to a usb_device. + */ +#define to_usb_device(dev) \ + _Generic((dev), \ + const struct device *: __to_usb_device_const, \ + struct device *: __to_usb_device)(dev) + +static inline struct usb_device *__intf_to_usbdev(struct usb_interface *intf) { return to_usb_device(intf->dev.parent); } +static inline const struct usb_device *__intf_to_usbdev_const(const struct usb_interface *intf) +{ + return to_usb_device((const struct device *)intf->dev.parent); +} + +#define interface_to_usbdev(intf) \ + _Generic((intf), \ + const struct usb_interface *: __intf_to_usbdev_const, \ + struct usb_interface *: __intf_to_usbdev)(intf) extern struct usb_device *usb_get_dev(struct usb_device *dev); extern void usb_put_dev(struct usb_device *dev); -- cgit From 5033ac5c580cb22245a0c2b9e53d508e8fdd50d8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Oct 2022 18:51:28 +0200 Subject: USB: make devnode() callback in usb_class_driver take a const * With the changes to the driver core to make more pointers const, the USB subsystem also needs to be modified to take a const * for the devnode callback so that the driver core's constant pointer will also be properly propagated. Cc: Benjamin Tissoires Cc: Juergen Stuber Reviewed-by: Johan Hovold Acked-by: Pete Zaitcev Reviewed-by: Jiri Kosina Link: https://lore.kernel.org/r/20221001165128.2688526-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 3a55131e0ad4..4b463a5e4ba2 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1321,7 +1321,7 @@ struct usb_device_driver { */ struct usb_class_driver { char *name; - char *(*devnode)(struct device *dev, umode_t *mode); + char *(*devnode)(const struct device *dev, umode_t *mode); const struct file_operations *fops; int minor_base; }; -- cgit From 2e83b879fb91dafe995967b46a1d38a5b0889242 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Sep 2022 14:29:07 -0700 Subject: srcu: Create an srcu_read_lock_nmisafe() and srcu_read_unlock_nmisafe() On strict load-store architectures, the use of this_cpu_inc() by srcu_read_lock() and srcu_read_unlock() is not NMI-safe in TREE SRCU. To see this suppose that an NMI arrives in the middle of srcu_read_lock(), just after it has read ->srcu_lock_count, but before it has written the incremented value back to memory. If that NMI handler also does srcu_read_lock() and srcu_read_lock() on that same srcu_struct structure, then upon return from that NMI handler, the interrupted srcu_read_lock() will overwrite the NMI handler's update to ->srcu_lock_count, but leave unchanged the NMI handler's update by srcu_read_unlock() to ->srcu_unlock_count. This can result in a too-short SRCU grace period, which can in turn result in arbitrary memory corruption. If the NMI handler instead interrupts the srcu_read_unlock(), this can result in eternal SRCU grace periods, which is not much better. This commit therefore creates a pair of new srcu_read_lock_nmisafe() and srcu_read_unlock_nmisafe() functions, which allow SRCU readers in both NMI handlers and in process and IRQ context. It is bad practice to mix the existing and the new _nmisafe() primitives on the same srcu_struct structure. Use one set or the other, not both. Just to underline that "bad practice" point, using srcu_read_lock() at process level and srcu_read_lock_nmisafe() in your NMI handler will not, repeat NOT, work. If you do not immediately understand why this is the case, please review the earlier paragraphs in this commit log. [ paulmck: Apply kernel test robot feedback. ] [ paulmck: Apply feedback from Randy Dunlap. ] [ paulmck: Apply feedback from John Ogness. ] [ paulmck: Apply feedback from Frederic Weisbecker. ] Link: https://lore.kernel.org/all/20220910221947.171557773@linutronix.de/ Signed-off-by: Paul E. McKenney Acked-by: Randy Dunlap # build-tested Reviewed-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: John Ogness Cc: Petr Mladek --- include/linux/srcu.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 01226e4d960a..4fac088072c3 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -64,6 +64,20 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); +#ifdef CONFIG_NEED_SRCU_NMI_SAFE +int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); +void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); +#else +static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) +{ + return __srcu_read_lock(ssp); +} +static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) +{ + __srcu_read_unlock(ssp, idx); +} +#endif /* CONFIG_NEED_SRCU_NMI_SAFE */ + #ifdef CONFIG_SRCU void srcu_init(void); #else /* #ifdef CONFIG_SRCU */ @@ -166,6 +180,25 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) return retval; } +/** + * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but in an NMI-safe manner. + * See srcu_read_lock() for more information. + */ +static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp) +{ + int retval; + + if (IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + retval = __srcu_read_lock_nmisafe(ssp); + else + retval = __srcu_read_lock(ssp); + rcu_lock_acquire(&(ssp)->dep_map); + return retval; +} + /* Used by tracing, cannot be traced and cannot invoke lockdep. */ static inline notrace int srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) @@ -191,6 +224,24 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __srcu_read_unlock(ssp, idx); } +/** + * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock(). + * + * Exit an SRCU read-side critical section, but in an NMI-safe manner. + */ +static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) + __releases(ssp) +{ + WARN_ON_ONCE(idx & ~0x1); + rcu_lock_release(&(ssp)->dep_map); + if (IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + __srcu_read_unlock_nmisafe(ssp, idx); + else + __srcu_read_unlock(ssp, idx); +} + /* Used by tracing, cannot be traced and cannot call lockdep. */ static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) -- cgit From 27120e7d2c4d5c438b76f9c6330037a52ad0722e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Sep 2022 14:03:07 -0700 Subject: srcu: Check for consistent per-CPU per-srcu_struct NMI safety This commit adds runtime checks to verify that a given srcu_struct uses consistent NMI-safe (or not) read-side primitives on a per-CPU basis. Link: https://lore.kernel.org/all/20220910221947.171557773@linutronix.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: John Ogness Cc: Petr Mladek --- include/linux/srcu.h | 12 ++++++------ include/linux/srcutree.h | 5 +++++ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4fac088072c3..1a7840c1b87a 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -65,14 +65,14 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); #ifdef CONFIG_NEED_SRCU_NMI_SAFE -int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); -void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); +int __srcu_read_lock_nmisafe(struct srcu_struct *ssp, bool chknmisafe) __acquires(ssp); +void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx, bool chknmisafe) __releases(ssp); #else -static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) +static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp, bool chknmisafe) { return __srcu_read_lock(ssp); } -static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) +static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx, bool chknmisafe) { __srcu_read_unlock(ssp, idx); } @@ -192,7 +192,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp int retval; if (IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) - retval = __srcu_read_lock_nmisafe(ssp); + retval = __srcu_read_lock_nmisafe(ssp, true); else retval = __srcu_read_lock(ssp); rcu_lock_acquire(&(ssp)->dep_map); @@ -237,7 +237,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) WARN_ON_ONCE(idx & ~0x1); rcu_lock_release(&(ssp)->dep_map); if (IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) - __srcu_read_unlock_nmisafe(ssp, idx); + __srcu_read_unlock_nmisafe(ssp, idx, true); else __srcu_read_unlock(ssp, idx); } diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 0c4eca07d78d..1ef8f2a4884f 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -25,6 +25,7 @@ struct srcu_data { /* Read-side state. */ atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ + int srcu_nmi_safety; /* NMI-safe srcu_struct structure? */ /* Update-side state. */ spinlock_t __private lock ____cacheline_internodealigned_in_smp; @@ -42,6 +43,10 @@ struct srcu_data { struct srcu_struct *ssp; }; +#define SRCU_NMI_UNKNOWN 0x0 +#define SRCU_NMI_NMI_UNSAFE 0x1 +#define SRCU_NMI_NMI_SAFE 0x2 + /* * Node in SRCU combining tree, similar in function to rcu_data. */ -- cgit From 67776a9ee69512b144250f1b9fbce4db76c0f3f8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 18 Oct 2022 21:10:52 +0200 Subject: ata: remove unused helper ata_id_lba48_enabled() Not only is this function unused, but even worse, the bit it is checking is actually used for signaling if the feature is supported, not enabled. Therefore, remove the unused helper function ata_id_lba48_enabled(). ata_id_has_lba48() is left unmodified, since this extra supported bit (Bit 10 of word 86) is simply a copy of the bit that ata_id_has_lba48() already checks (Bit 10 of word 83), see ACS-5 r10: 7.13.6.41 Words 85..87, 120: Commands and feature sets supported or enabled Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index e3050e153a71..c04aca58448a 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -698,15 +698,6 @@ static inline bool ata_id_has_lba48(const u16 *id) return id[ATA_ID_COMMAND_SET_2] & (1 << 10); } -static inline bool ata_id_lba48_enabled(const u16 *id) -{ - if (ata_id_has_lba48(id) == 0) - return false; - if ((id[ATA_ID_CSF_DEFAULT] & 0xC000) != 0x4000) - return false; - return id[ATA_ID_CFS_ENABLE_2] & (1 << 10); -} - static inline bool ata_id_hpa_enabled(const u16 *id) { /* Yes children, word 83 valid bits cover word 82 data */ -- cgit From 73eb5507fa5f86cd1b4e1a75348ff2e0d620c662 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 18 Oct 2022 21:10:53 +0200 Subject: ata: remove unused helper ata_id_flush_enabled() Not only is this function unused, but even worse, the bit it is checking is actually used for signaling if the feature is supported, not enabled. Therefore, remove the unused helper function ata_id_flush_enabled(). ata_id_has_flush() is left unmodified, since this extra supported bit (Bit 12 of word 86) is simply a copy of the bit that ata_id_has_flush() already checks (Bit 12 of word 83), see ACS-5 r10: 7.13.6.41 Words 85..87, 120: Commands and feature sets supported or enabled Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index c04aca58448a..b32d62542e26 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -615,15 +615,6 @@ static inline bool ata_id_has_flush(const u16 *id) return id[ATA_ID_COMMAND_SET_2] & (1 << 12); } -static inline bool ata_id_flush_enabled(const u16 *id) -{ - if (ata_id_has_flush(id) == 0) - return false; - if ((id[ATA_ID_CSF_DEFAULT] & 0xC000) != 0x4000) - return false; - return id[ATA_ID_CFS_ENABLE_2] & (1 << 12); -} - static inline bool ata_id_has_flush_ext(const u16 *id) { if ((id[ATA_ID_COMMAND_SET_2] & 0xC000) != 0x4000) -- cgit From 90c313d353051c0eb5f6bbcb4b0f4cc0242f030c Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 18 Oct 2022 21:10:54 +0200 Subject: ata: remove unused helper ata_id_flush_ext_enabled() Not only is this function unused, but even worse, the bit it is checking is actually used for signaling if the feature is supported, not enabled. Therefore, remove the unused helper function ata_id_flush_ext_enabled(). ata_id_has_flush_ext() is left unmodified, since this extra supported bit (Bit 13 of word 86) is simply a copy of the bit that ata_id_has_flush_ext() already checks (Bit 13 of word 83), see ACS-5 r10: 7.13.6.41 Words 85..87, 120: Commands and feature sets supported or enabled Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index b32d62542e26..0c18499f60b6 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -622,19 +622,6 @@ static inline bool ata_id_has_flush_ext(const u16 *id) return id[ATA_ID_COMMAND_SET_2] & (1 << 13); } -static inline bool ata_id_flush_ext_enabled(const u16 *id) -{ - if (ata_id_has_flush_ext(id) == 0) - return false; - if ((id[ATA_ID_CSF_DEFAULT] & 0xC000) != 0x4000) - return false; - /* - * some Maxtor disks have bit 13 defined incorrectly - * so check bit 10 too - */ - return (id[ATA_ID_CFS_ENABLE_2] & 0x2400) == 0x2400; -} - static inline u32 ata_id_logical_sector_size(const u16 *id) { /* T13/1699-D Revision 6a, Sep 6, 2008. Page 128. -- cgit From aea672d054a21782ed8450c75febb6ba3c208ca4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 20 Oct 2022 22:54:21 +0300 Subject: spi: Introduce spi_get_device_match_data() helper The proposed spi_get_device_match_data() helper is for retrieving a driver data associated with the ID in an ID table. First, it tries to get driver data of the device enumerated by firmware interface (usually Device Tree or ACPI). If none is found it falls back to the SPI ID table matching. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20221020195421.10482-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index fbf8c0d95968..8fe3d0a9d2c9 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1514,6 +1514,9 @@ extern void spi_unregister_device(struct spi_device *spi); extern const struct spi_device_id * spi_get_device_id(const struct spi_device *sdev); +extern const void * +spi_get_device_match_data(const struct spi_device *sdev); + static inline bool spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer) { -- cgit From e9f8a790bf682bb4a2f79d0d905b34d55bceb71d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Sep 2022 08:57:56 -0700 Subject: slab: Explain why SLAB_TYPESAFE_BY_RCU reference before locking It is not obvious to the casual user why it is absolutely necessary to acquire a reference to a SLAB_TYPESAFE_BY_RCU structure before acquiring a lock in that structure. Therefore, add a comment explaining this point. [ paulmck: Apply Vlastimil Babka feedback. ] Signed-off-by: Paul E. McKenney Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Roman Gushchin Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Acked-by: Vlastimil Babka --- include/linux/slab.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 90877fcde70b..487418c7ea8c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -76,6 +76,17 @@ * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. * + * Note that it is not possible to acquire a lock within a structure + * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference + * as described above. The reason is that SLAB_TYPESAFE_BY_RCU pages + * are not zeroed before being given to the slab, which means that any + * locks must be initialized after each and every kmem_struct_alloc(). + * Alternatively, make the ctor passed to kmem_cache_create() initialize + * the locks at page-allocation time, as is done in __i915_request_ctor(), + * sighand_ctor(), and anon_vma_ctor(). Such a ctor permits readers + * to safely acquire those ctor-initialized locks under rcu_read_lock() + * protection. + * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ /* Defer freeing slabs to RCU */ -- cgit From fdbdb868454a3e83996cf1500c6f7ba73c07a03c Mon Sep 17 00:00:00 2001 From: Yipeng Zou Date: Mon, 26 Sep 2022 09:58:27 +0800 Subject: rcu: Remove rcu_is_idle_cpu() The commit 3fcd6a230fa7 ("x86/cpu: Avoid cpuinfo-induced IPIing of idle CPUs") introduced rcu_is_idle_cpu() in order to identify the current CPU idle state. But commit f3eca381bd49 ("x86/aperfmperf: Replace arch_freq_get_on_cpu()") switched to using MAX_SAMPLE_AGE, so rcu_is_idle_cpu() is no longer used. This commit therefore removes it. Fixes: f3eca381bd49 ("x86/aperfmperf: Replace arch_freq_get_on_cpu()") Signed-off-by: Yipeng Zou Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 2 -- include/linux/rcutree.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 9bc025aa79a3..5c271bf3a1e7 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -146,8 +146,6 @@ static inline void rcu_virt_note_context_switch(void) { } static inline void rcu_cpu_stall_reset(void) { } static inline int rcu_jiffies_till_stall_check(void) { return 21 * HZ; } static inline void rcu_irq_exit_check_preempt(void) { } -#define rcu_is_idle_cpu(cpu) \ - (is_idle_task(current) && !in_nmi() && !in_hardirq() && !in_serving_softirq()) static inline void exit_rcu(void) { } static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 70795386b9ff..4003bf6cfa1c 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -87,8 +87,6 @@ bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu(unsigned long oldstate); void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); -bool rcu_is_idle_cpu(int cpu); - #ifdef CONFIG_PROVE_RCU void rcu_irq_exit_check_preempt(void); #else -- cgit From e29a4915db1480f96e0bc2e928699d086a71f43c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 13 Oct 2022 19:22:44 +0200 Subject: srcu: Debug NMI safety even on archs that don't require it Currently the NMI safety debugging is only performed on architectures that don't support NMI-safe this_cpu_inc(). Reorder the code so that other architectures like x86 also detect bad uses. [ paulmck: Apply kernel test robot, Stephen Rothwell, and Zqiang feedback. ] Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 36 ++++++++++++++++++++++++------------ include/linux/srcutree.h | 4 ---- 2 files changed, 24 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 1a7840c1b87a..f0814ffca34b 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -65,14 +65,14 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); #ifdef CONFIG_NEED_SRCU_NMI_SAFE -int __srcu_read_lock_nmisafe(struct srcu_struct *ssp, bool chknmisafe) __acquires(ssp); -void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx, bool chknmisafe) __releases(ssp); +int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); +void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); #else -static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp, bool chknmisafe) +static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { return __srcu_read_lock(ssp); } -static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx, bool chknmisafe) +static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { __srcu_read_unlock(ssp, idx); } @@ -118,6 +118,18 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#define SRCU_NMI_UNKNOWN 0x0 +#define SRCU_NMI_UNSAFE 0x1 +#define SRCU_NMI_SAFE 0x2 + +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) +void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe); +#else +static inline void srcu_check_nmi_safety(struct srcu_struct *ssp, + bool nmi_safe) { } +#endif + + /** * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing @@ -175,6 +187,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; + srcu_check_nmi_safety(ssp, false); retval = __srcu_read_lock(ssp); rcu_lock_acquire(&(ssp)->dep_map); return retval; @@ -191,10 +204,8 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp { int retval; - if (IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) - retval = __srcu_read_lock_nmisafe(ssp, true); - else - retval = __srcu_read_lock(ssp); + srcu_check_nmi_safety(ssp, true); + retval = __srcu_read_lock_nmisafe(ssp); rcu_lock_acquire(&(ssp)->dep_map); return retval; } @@ -205,6 +216,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; + srcu_check_nmi_safety(ssp, false); retval = __srcu_read_lock(ssp); return retval; } @@ -220,6 +232,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); + srcu_check_nmi_safety(ssp, false); rcu_lock_release(&(ssp)->dep_map); __srcu_read_unlock(ssp, idx); } @@ -235,17 +248,16 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); + srcu_check_nmi_safety(ssp, true); rcu_lock_release(&(ssp)->dep_map); - if (IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) - __srcu_read_unlock_nmisafe(ssp, idx, true); - else - __srcu_read_unlock(ssp, idx); + __srcu_read_unlock_nmisafe(ssp, idx); } /* Used by tracing, cannot be traced and cannot call lockdep. */ static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { + srcu_check_nmi_safety(ssp, false); __srcu_read_unlock(ssp, idx); } diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 1ef8f2a4884f..c689a81752c9 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -43,10 +43,6 @@ struct srcu_data { struct srcu_struct *ssp; }; -#define SRCU_NMI_UNKNOWN 0x0 -#define SRCU_NMI_NMI_UNSAFE 0x1 -#define SRCU_NMI_NMI_SAFE 0x2 - /* * Node in SRCU combining tree, similar in function to rcu_data. */ -- cgit From 33a0a1e3b3d17445832177981dc7a1c6a5b009f8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Oct 2022 18:53:15 +0200 Subject: kobject: modify kobject_get_path() to take a const * kobject_get_path() does not modify the kobject passed to it, so make the pointer constant. Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20221001165315.2690141-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 57fb972fea05..592f9785b058 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -115,7 +115,7 @@ extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); extern void kobject_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid); -extern char *kobject_get_path(struct kobject *kobj, gfp_t flag); +extern char *kobject_get_path(const struct kobject *kobj, gfp_t flag); struct kobj_type { void (*release)(struct kobject *kobj); -- cgit From 3d24903a6dd27ab817b4c6c24bee245ff06f7c8e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 21 Oct 2022 09:23:10 +0200 Subject: kobject: make get_ktype() take a const pointer get_ktype() does not modify the structure passed to it, so mark the parameter as being const to allow other const structures to be passed to it in the future. Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20221021072310.3931690-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 592f9785b058..fc40fc81aeb1 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -198,7 +198,7 @@ static inline void kset_put(struct kset *k) kobject_put(&k->kobj); } -static inline const struct kobj_type *get_ktype(struct kobject *kobj) +static inline const struct kobj_type *get_ktype(const struct kobject *kobj) { return kobj->ktype; } -- cgit From b295d484b97081feba72b071ffcb72fb4638ccfd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 4 Oct 2022 12:21:25 +0300 Subject: device property: Allow const parameter to dev_fwnode() It's not fully correct to take a const parameter pointer to a struct and return a non-const pointer to a member of that struct. Instead, introduce a const version of the dev_fwnode() API which takes and returns const pointers and use it where it's applicable. With this, convert dev_fwnode() to be a macro wrapper on top of const and non-const APIs that chooses one based on the type. Suggested-by: Sakari Ailus Fixes: aade55c86033 ("device property: Add const qualifier to device_get_match_data() parameter") Signed-off-by: Andy Shevchenko Acked-by: Heikki Krogerus Reviewed-by: Sakari Ailus Link: https://lore.kernel.org/r/20221004092129.19412-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 117cc200c656..587b5b666b5b 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -32,7 +32,12 @@ enum dev_dma_attr { DEV_DMA_COHERENT, }; -struct fwnode_handle *dev_fwnode(const struct device *dev); +const struct fwnode_handle *__dev_fwnode_const(const struct device *dev); +struct fwnode_handle *__dev_fwnode(struct device *dev); +#define dev_fwnode(dev) \ + _Generic((dev), \ + const struct device *: __dev_fwnode_const, \ + struct device *: __dev_fwnode)(dev) bool device_property_present(struct device *dev, const char *propname); int device_property_read_u8_array(struct device *dev, const char *propname, -- cgit From 23ead33bc6ed62abc9adc5fe27b9911e2ef5d209 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 4 Oct 2022 12:21:26 +0300 Subject: device property: Constify fwnode connection match APIs The fwnode and device parameters are not altered in the fwnode connection match APIs, constify them. Signed-off-by: Andy Shevchenko Acked-by: Heikki Krogerus Reviewed-by: Sakari Ailus Link: https://lore.kernel.org/r/20221004092129.19412-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 587b5b666b5b..8d82775a901a 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -442,21 +442,21 @@ unsigned int fwnode_graph_get_endpoint_count(struct fwnode_handle *fwnode, int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint); -typedef void *(*devcon_match_fn_t)(struct fwnode_handle *fwnode, const char *id, +typedef void *(*devcon_match_fn_t)(const struct fwnode_handle *fwnode, const char *id, void *data); -void *fwnode_connection_find_match(struct fwnode_handle *fwnode, +void *fwnode_connection_find_match(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match); -static inline void *device_connection_find_match(struct device *dev, +static inline void *device_connection_find_match(const struct device *dev, const char *con_id, void *data, devcon_match_fn_t match) { return fwnode_connection_find_match(dev_fwnode(dev), con_id, data, match); } -int fwnode_connection_find_matches(struct fwnode_handle *fwnode, +int fwnode_connection_find_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len); -- cgit From a1bfed6094ac6868c43aaa43d021bf562cd93d07 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 4 Oct 2022 12:21:27 +0300 Subject: device property: Constify parameter in fwnode_graph_is_endpoint() Constify parameter in fwnode_graph_is_endpoint() since it doesn't alter anything related to it. Signed-off-by: Andy Shevchenko Acked-by: Heikki Krogerus Reviewed-by: Sakari Ailus Link: https://lore.kernel.org/r/20221004092129.19412-4-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 8d82775a901a..7abb1792044f 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -410,7 +410,7 @@ struct fwnode_handle *fwnode_graph_get_remote_port( struct fwnode_handle *fwnode_graph_get_remote_endpoint( const struct fwnode_handle *fwnode); -static inline bool fwnode_graph_is_endpoint(struct fwnode_handle *fwnode) +static inline bool fwnode_graph_is_endpoint(const struct fwnode_handle *fwnode) { return fwnode_property_present(fwnode, "remote-endpoint"); } -- cgit From 7952cd2b8213f20a1752634c25dfd215da537722 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 4 Oct 2022 12:21:28 +0300 Subject: device property: Constify device child node APIs The device parameter is not altered in the device child node APIs, constify them. Signed-off-by: Andy Shevchenko Acked-by: Heikki Krogerus Reviewed-by: Sakari Ailus Link: https://lore.kernel.org/r/20221004092129.19412-5-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 7abb1792044f..472689e53ade 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -114,16 +114,16 @@ struct fwnode_handle *fwnode_get_next_available_child_node( for (child = fwnode_get_next_available_child_node(fwnode, NULL); child;\ child = fwnode_get_next_available_child_node(fwnode, child)) -struct fwnode_handle *device_get_next_child_node( - struct device *dev, struct fwnode_handle *child); +struct fwnode_handle *device_get_next_child_node(const struct device *dev, + struct fwnode_handle *child); #define device_for_each_child_node(dev, child) \ for (child = device_get_next_child_node(dev, NULL); child; \ child = device_get_next_child_node(dev, child)) -struct fwnode_handle *fwnode_get_named_child_node( - const struct fwnode_handle *fwnode, const char *childname); -struct fwnode_handle *device_get_named_child_node(struct device *dev, +struct fwnode_handle *fwnode_get_named_child_node(const struct fwnode_handle *fwnode, + const char *childname); +struct fwnode_handle *device_get_named_child_node(const struct device *dev, const char *childname); struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode); @@ -132,7 +132,7 @@ void fwnode_handle_put(struct fwnode_handle *fwnode); int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index); int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name); -unsigned int device_get_child_node_count(struct device *dev); +unsigned int device_get_child_node_count(const struct device *dev); static inline bool device_property_read_bool(struct device *dev, const char *propname) -- cgit From 59789f3418dd3c0a187490d49e900a59a5c8d732 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 4 Oct 2022 12:21:29 +0300 Subject: device property: Constify parameter in device_dma_supported() and device_get_dma_attr() Constify parameter in device_dma_supported() and device_get_dma_attr() since they don't alter anything related to it. Signed-off-by: Andy Shevchenko Acked-by: Heikki Krogerus Reviewed-by: Sakari Ailus Link: https://lore.kernel.org/r/20221004092129.19412-6-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 472689e53ade..83674f968a8f 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -388,9 +388,8 @@ property_entries_dup(const struct property_entry *properties); void property_entries_free(const struct property_entry *properties); -bool device_dma_supported(struct device *dev); - -enum dev_dma_attr device_get_dma_attr(struct device *dev); +bool device_dma_supported(const struct device *dev); +enum dev_dma_attr device_get_dma_attr(const struct device *dev); const void *device_get_match_data(const struct device *dev); -- cgit From 6a542d1d5f6c814fd3643b43e85b21757c1e363b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2020 12:21:07 -0400 Subject: kill signal_pt_regs() Once upon at it was used on hot paths, but that had not been true since 2013. IOW, there's no point for arch-optimized equivalent of task_pt_regs(current) - remaining two users are not worth bothering with. Signed-off-by: Al Viro --- include/linux/ptrace.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index c952c5ba8fab..eaaef3ffec22 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -389,15 +389,6 @@ static inline void user_single_step_report(struct pt_regs *regs) #define current_pt_regs() task_pt_regs(current) #endif -/* - * unlike current_pt_regs(), this one is equal to task_pt_regs(current) - * on *all* architectures; the only reason to have a per-arch definition - * is optimisation. - */ -#ifndef signal_pt_regs -#define signal_pt_regs() task_pt_regs(current) -#endif - #ifndef current_user_stack_pointer #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif -- cgit From 9a938eba8d284fba0daff62142dece74ae3c16de Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2020 12:25:39 -0400 Subject: kill coredump_params->regs it's always task_pt_regs(current) Signed-off-by: Al Viro --- include/linux/coredump.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 08a1d3e7e46d..a0655d7c149c 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -18,7 +18,6 @@ struct core_vma_metadata { struct coredump_params { const kernel_siginfo_t *siginfo; - struct pt_regs *regs; struct file *file; unsigned long limit; unsigned long mm_flags; -- cgit From fcf1492d6697fb22a4328260b0c76be12ed3badd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Sep 2022 17:15:38 -0400 Subject: elf_core_copy_task_regs(): task_pt_regs is defined everywhere Had been since 2011 for all live architectures, ever since 2013 for all architectures, period. Signed-off-by: Al Viro --- include/linux/elfcore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 346a8b56cdc8..fcf58e16d1e3 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -88,7 +88,7 @@ static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* { #if defined (ELF_CORE_COPY_TASK_REGS) return ELF_CORE_COPY_TASK_REGS(t, elfregs); -#elif defined (task_pt_regs) +#else elf_core_copy_regs(elfregs, task_pt_regs(t)); #endif return 0; -- cgit From 2d4daa549c17b6ba4845a751c7a78d3b2419d78f Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Tue, 27 Sep 2022 15:16:36 -0500 Subject: x86/resctrl: Remove arch_has_empty_bitmaps The field arch_has_empty_bitmaps is not required anymore. The field min_cbm_bits is enough to validate the CBM (capacity bit mask) if the architecture can support the zero CBM or not. Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/166430979654.372014.615622285687642644.stgit@bmoger-ubuntu --- include/linux/resctrl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 0cf5b20c6ddf..0cee154abc9f 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -89,11 +89,12 @@ struct rdt_domain { /** * struct resctrl_cache - Cache allocation related data * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set + * @min_cbm_bits: Minimum number of consecutive bits to be set. + * The value 0 means the architecture can support + * zero CBM. * @shareable_bits: Bitmask of shareable resource with other * executing entities * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. - * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache * level has CPU scope. */ @@ -102,7 +103,6 @@ struct resctrl_cache { unsigned int min_cbm_bits; unsigned int shareable_bits; bool arch_has_sparse_bitmaps; - bool arch_has_empty_bitmaps; bool arch_has_per_cpu_cfg; }; -- cgit From 36805be775ae30e8c7c390f825e7a2528c260d29 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 7 Oct 2022 16:44:44 +0300 Subject: gpio: reg: Add missing header(s) Do not imply that some of the generic headers may be always included. Instead, include explicitly what we are direct user of. Signed-off-by: Andy Shevchenko --- include/linux/gpio/gpio-reg.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/gpio-reg.h b/include/linux/gpio/gpio-reg.h index 39b888c40b39..3913b6660ed1 100644 --- a/include/linux/gpio/gpio-reg.h +++ b/include/linux/gpio/gpio-reg.h @@ -2,9 +2,13 @@ #ifndef GPIO_REG_H #define GPIO_REG_H +#include + struct device; struct irq_domain; +struct gpio_chip; + struct gpio_chip *gpio_reg_init(struct device *dev, void __iomem *reg, int base, int num, const char *label, u32 direction, u32 def_out, const char *const *names, struct irq_domain *irqdom, const int *irqs); -- cgit From 08a149c40bdbb9cd08fd0d39c6976d713a187300 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 7 Oct 2022 12:53:44 +0300 Subject: gpiolib: Clean up headers There is a few things done: - include only the headers we are direct user of - when pointer is in use, provide a forward declaration - add missing headers - group generic headers and subsystem headers - sort each group alphabetically While at it, fix some awkward indentations. Signed-off-by: Andy Shevchenko --- include/linux/gpio.h | 2 +- include/linux/gpio/driver.h | 2 +- include/linux/gpio/machine.h | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index a370387fa406..346f60bbab30 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -98,9 +98,9 @@ int devm_gpio_request_one(struct device *dev, unsigned gpio, #else /* ! CONFIG_GPIOLIB */ +#include #include #include -#include struct device; struct gpio_chip; diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 6aeea1071b1b..2a44600b01f7 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -7,8 +7,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h index 0b619eb7ae83..44e5f162973e 100644 --- a/include/linux/gpio/machine.h +++ b/include/linux/gpio/machine.h @@ -3,7 +3,6 @@ #define __LINUX_GPIO_MACHINE_H #include -#include enum gpio_lookup_flags { GPIO_ACTIVE_HIGH = (0 << 0), -- cgit From a5ef058dc4d9a3e60d1808a0700e18e0e37e408e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:51 +0200 Subject: net: introduce and use custom sockopt socket flag We will soon introduce custom setsockopt for UDP sockets, too. Instead of doing even more complex arbitrary checks inside sock_use_custom_sol_socket(), add a new socket flag and set it for the relevant socket types (currently only MPTCP). Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/net.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 711c3593c3b8..59350fd85823 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,7 @@ struct net; #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define SOCK_CUSTOM_SOCKOPT 5 #ifndef ARCH_HAS_SOCKET_TYPES /** -- cgit From 8a3854c7b8e4532063b14bed34115079b7d0cb36 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:52 +0200 Subject: udp: track the forward memory release threshold in an hot cacheline When the receiver process and the BH runs on different cores, udp_rmem_release() experience a cache miss while accessing sk_rcvbuf, as the latter shares the same cacheline with sk_forward_alloc, written by the BH. With this patch, UDP tracks the rcvbuf value and its update via custom SOL_SOCKET socket options, and copies the forward memory threshold value used by udp_rmem_release() in a different cacheline, already accessed by the above function and uncontended. Since the UDP socket init operation grown a bit, factor out the common code between v4 and v6 in a shared helper. Overall the above give a 10% peek throughput increase under UDP flood. Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/udp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index e96da4157d04..5cdba00a904a 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -87,6 +87,9 @@ struct udp_sock { /* This field is dirtied by udp_recvmsg() */ int forward_deficit; + + /* This fields follows rcvbuf value, and is touched by udp_recvmsg */ + int forward_threshold; }; #define UDP_MAX_SEGMENTS (1 << 6UL) -- cgit From 4727bab4e9bbeafeff6acdfcb077a7a548cbde30 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 21 Oct 2022 10:58:22 +0800 Subject: net: skb: move skb_pp_recycle() to skbuff.c skb_pp_recycle() is only used by skb_free_head() in skbuff.c, so move it to skbuff.c. Signed-off-by: Yunsheng Lin Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7be5bb4c94b6..59c9fd55699d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5050,12 +5050,5 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) } #endif -static inline bool skb_pp_recycle(struct sk_buff *skb, void *data) -{ - if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) - return false; - return page_pool_return_skb_page(virt_to_page(data)); -} - #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ -- cgit From 88a947215c29aa49307c8cd0638ba54e4cf07391 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Oct 2022 22:00:15 +0300 Subject: spi: pxa2xx: Validate the correctness of the SSP type Currently we blindly apply the SSP type value from any source of the information. Increase robustness by validating the value before use. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20221021190018.63646-2-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/pxa2xx_ssp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index a3fec2de512f..cd1973e6ac4b 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -229,6 +229,7 @@ enum pxa_ssp_type { LPSS_SPT_SSP, LPSS_BXT_SSP, LPSS_CNL_SSP, + SSP_MAX }; struct ssp_device { -- cgit From e5530adc17a79f2a93e8b35e0ce673fc33f5f663 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 7 Oct 2022 12:53:44 +0300 Subject: pinctrl: Clean up headers There is a few things done: - include only the headers we are direct user of - when pointer is in use, provide a forward declaration - add missing headers - group generic headers and subsystem headers - sort each group alphabetically While at it, fix some awkward indentations. Signed-off-by: Andy Shevchenko --- include/linux/pinctrl/consumer.h | 31 ++++++++++++++----------------- include/linux/pinctrl/devinfo.h | 6 ++++-- include/linux/pinctrl/machine.h | 8 +++++--- include/linux/pinctrl/pinconf-generic.h | 23 +++++++++++++---------- include/linux/pinctrl/pinctrl.h | 18 +++++++++--------- include/linux/pinctrl/pinmux.h | 5 ++--- 6 files changed, 47 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 019fecd75d0c..4729d54e8995 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -12,14 +12,15 @@ #define __LINUX_PINCTRL_CONSUMER_H #include -#include -#include +#include + #include +struct device; + /* This struct is private to the core and should be regarded as a cookie */ struct pinctrl; struct pinctrl_state; -struct device; #ifdef CONFIG_PINCTRL @@ -33,9 +34,8 @@ extern int pinctrl_gpio_set_config(unsigned gpio, unsigned long config); extern struct pinctrl * __must_check pinctrl_get(struct device *dev); extern void pinctrl_put(struct pinctrl *p); -extern struct pinctrl_state * __must_check pinctrl_lookup_state( - struct pinctrl *p, - const char *name); +extern struct pinctrl_state * __must_check pinctrl_lookup_state(struct pinctrl *p, + const char *name); extern int pinctrl_select_state(struct pinctrl *p, struct pinctrl_state *s); extern struct pinctrl * __must_check devm_pinctrl_get(struct device *dev); @@ -101,9 +101,8 @@ static inline void pinctrl_put(struct pinctrl *p) { } -static inline struct pinctrl_state * __must_check pinctrl_lookup_state( - struct pinctrl *p, - const char *name) +static inline struct pinctrl_state * __must_check pinctrl_lookup_state(struct pinctrl *p, + const char *name) { return NULL; } @@ -145,8 +144,8 @@ static inline int pinctrl_pm_select_idle_state(struct device *dev) #endif /* CONFIG_PINCTRL */ -static inline struct pinctrl * __must_check pinctrl_get_select( - struct device *dev, const char *name) +static inline struct pinctrl * __must_check pinctrl_get_select(struct device *dev, + const char *name) { struct pinctrl *p; struct pinctrl_state *s; @@ -171,14 +170,13 @@ static inline struct pinctrl * __must_check pinctrl_get_select( return p; } -static inline struct pinctrl * __must_check pinctrl_get_select_default( - struct device *dev) +static inline struct pinctrl * __must_check pinctrl_get_select_default(struct device *dev) { return pinctrl_get_select(dev, PINCTRL_STATE_DEFAULT); } -static inline struct pinctrl * __must_check devm_pinctrl_get_select( - struct device *dev, const char *name) +static inline struct pinctrl * __must_check devm_pinctrl_get_select(struct device *dev, + const char *name) { struct pinctrl *p; struct pinctrl_state *s; @@ -203,8 +201,7 @@ static inline struct pinctrl * __must_check devm_pinctrl_get_select( return p; } -static inline struct pinctrl * __must_check devm_pinctrl_get_select_default( - struct device *dev) +static inline struct pinctrl * __must_check devm_pinctrl_get_select_default(struct device *dev) { return devm_pinctrl_get_select(dev, PINCTRL_STATE_DEFAULT); } diff --git a/include/linux/pinctrl/devinfo.h b/include/linux/pinctrl/devinfo.h index a48ff69acddd..9e8b559e1253 100644 --- a/include/linux/pinctrl/devinfo.h +++ b/include/linux/pinctrl/devinfo.h @@ -14,11 +14,15 @@ #ifndef PINCTRL_DEVINFO_H #define PINCTRL_DEVINFO_H +struct device; + #ifdef CONFIG_PINCTRL /* The device core acts as a consumer toward pinctrl */ #include +struct pinctrl; + /** * struct dev_pin_info - pin state container for devices * @p: pinctrl handle for the containing device @@ -42,8 +46,6 @@ extern int pinctrl_init_done(struct device *dev); #else -struct device; - /* Stubs if we're not using pinctrl */ static inline int pinctrl_bind_pins(struct device *dev) diff --git a/include/linux/pinctrl/machine.h b/include/linux/pinctrl/machine.h index e987dc9fd2af..0639b36f43c5 100644 --- a/include/linux/pinctrl/machine.h +++ b/include/linux/pinctrl/machine.h @@ -11,7 +11,7 @@ #ifndef __LINUX_PINCTRL_MACHINE_H #define __LINUX_PINCTRL_MACHINE_H -#include +#include /* ARRAY_SIZE() */ #include @@ -149,16 +149,18 @@ struct pinctrl_map { #define PIN_MAP_CONFIGS_GROUP_HOG_DEFAULT(dev, grp, cfgs) \ PIN_MAP_CONFIGS_GROUP(dev, PINCTRL_STATE_DEFAULT, dev, grp, cfgs) +struct pinctrl_map; + #ifdef CONFIG_PINCTRL extern int pinctrl_register_mappings(const struct pinctrl_map *map, - unsigned num_maps); + unsigned num_maps); extern void pinctrl_unregister_mappings(const struct pinctrl_map *map); extern void pinctrl_provide_dummies(void); #else static inline int pinctrl_register_mappings(const struct pinctrl_map *map, - unsigned num_maps) + unsigned num_maps) { return 0; } diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 2422211d6a5a..940fc4e9e17c 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -11,9 +11,12 @@ #ifndef __LINUX_PINCTRL_PINCONF_GENERIC_H #define __LINUX_PINCTRL_PINCONF_GENERIC_H -#include +#include + #include +struct device_node; + struct pinctrl_dev; struct pinctrl_map; @@ -196,25 +199,25 @@ int pinconf_generic_dt_node_to_map(struct pinctrl_dev *pctldev, void pinconf_generic_dt_free_map(struct pinctrl_dev *pctldev, struct pinctrl_map *map, unsigned num_maps); -static inline int pinconf_generic_dt_node_to_map_group( - struct pinctrl_dev *pctldev, struct device_node *np_config, - struct pinctrl_map **map, unsigned *num_maps) +static inline int pinconf_generic_dt_node_to_map_group(struct pinctrl_dev *pctldev, + struct device_node *np_config, struct pinctrl_map **map, + unsigned *num_maps) { return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps, PIN_MAP_TYPE_CONFIGS_GROUP); } -static inline int pinconf_generic_dt_node_to_map_pin( - struct pinctrl_dev *pctldev, struct device_node *np_config, - struct pinctrl_map **map, unsigned *num_maps) +static inline int pinconf_generic_dt_node_to_map_pin(struct pinctrl_dev *pctldev, + struct device_node *np_config, struct pinctrl_map **map, + unsigned *num_maps) { return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps, PIN_MAP_TYPE_CONFIGS_PIN); } -static inline int pinconf_generic_dt_node_to_map_all( - struct pinctrl_dev *pctldev, struct device_node *np_config, - struct pinctrl_map **map, unsigned *num_maps) +static inline int pinconf_generic_dt_node_to_map_all(struct pinctrl_dev *pctldev, + struct device_node *np_config, struct pinctrl_map **map, + unsigned *num_maps) { /* * passing the type as PIN_MAP_TYPE_INVALID causes the underlying parser diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 487117ccb1bc..31fe992412f0 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -11,20 +11,20 @@ #ifndef __LINUX_PINCTRL_PINCTRL_H #define __LINUX_PINCTRL_PINCTRL_H -#include -#include -#include -#include -#include +#include struct device; +struct device_node; +struct gpio_chip; +struct module; +struct seq_file; + +struct pin_config_item; +struct pinconf_generic_params; +struct pinconf_ops; struct pinctrl_dev; struct pinctrl_map; struct pinmux_ops; -struct pinconf_ops; -struct pin_config_item; -struct gpio_chip; -struct device_node; /** * struct pingroup - provides information on pingroup diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index 9a647fa5c8f1..a7e370965c53 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -11,11 +11,10 @@ #ifndef __LINUX_PINCTRL_PINMUX_H #define __LINUX_PINCTRL_PINMUX_H -#include -#include -#include +#include struct pinctrl_dev; +struct pinctrl_gpio_range; /** * struct pinmux_ops - pinmux operations, to be implemented by pin controller -- cgit From 2aa14b1ab2c41a4fe41efae80d58bb77da91f19f Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 17 Oct 2022 13:32:37 -0700 Subject: zstd: import usptream v1.5.2 Updates the kernel's zstd library to v1.5.2, the latest zstd release. The upstream tag it is updated to is `v1.5.2-kernel`, which contains several cherry-picked commits on top of the v1.5.2 release which are required for the kernel update. I will create this tag once the PR is ready to merge, until then reference the temporary upstream branch `v1.5.2-kernel-cherrypicks`. I plan to submit this patch as part of the v6.2 merge window. I've done basic build testing & testing on x86-64, i386, and aarch64. I'm merging these patches into my `zstd-next` branch, which is pulled into `linux-next` for further testing. I've benchmarked BtrFS with zstd compression on a x86-64 machine, and saw these results. Decompression speed is a small win across the board. The lower compression levels 1-4 see both compression speed and compression ratio wins. The higher compression levels see a small compression speed loss and about neutral ratio. I expect the lower compression levels to be used much more heavily than the high compression levels, so this should be a net win. Level CTime DTime Ratio 1 -2.95% -1.1% -0.7% 3 -3.5% -1.2% -0.5% 5 +3.7% -1.0% +0.0% 7 +3.2% -0.9% +0.0% 9 -4.3% -0.8% +0.1% Signed-off-by: Nick Terrell --- include/linux/zstd_lib.h | 475 +++++++++++++++++++++++++++++------------------ 1 file changed, 297 insertions(+), 178 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h index 6b91758b61af..79d55465d5c1 100644 --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -17,8 +17,16 @@ /* ===== ZSTDLIB_API : control library symbols visibility ===== */ -#define ZSTDLIB_VISIBILITY -#define ZSTDLIB_API ZSTDLIB_VISIBILITY +#ifndef ZSTDLIB_VISIBLE +# if (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) +# define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) +# else +# define ZSTDLIB_VISIBLE +# define ZSTDLIB_HIDDEN +# endif +#endif +#define ZSTDLIB_API ZSTDLIB_VISIBLE /* ***************************************************************************** @@ -56,8 +64,8 @@ /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 -#define ZSTD_VERSION_MINOR 4 -#define ZSTD_VERSION_RELEASE 10 +#define ZSTD_VERSION_MINOR 5 +#define ZSTD_VERSION_RELEASE 2 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) /*! ZSTD_versionNumber() : @@ -94,7 +102,6 @@ ZSTDLIB_API const char* ZSTD_versionString(void); #define ZSTD_BLOCKSIZE_MAX (1<= first frame size * @return : the compressed size of the first frame starting at `src`, @@ -165,8 +172,9 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ -ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ /* ************************************* @@ -219,9 +227,9 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, const void* src, size_t srcSize); -/* ************************************* -* Advanced compression API -***************************************/ +/* ******************************************* +* Advanced compression API (Requires v1.4.0+) +**********************************************/ /* API design : * Parameters are pushed one by one into an existing context, @@ -232,7 +240,7 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, * * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). * - * This API supercedes all other "advanced" API entry points in the experimental section. + * This API supersedes all other "advanced" API entry points in the experimental section. * In the future, we expect to remove from experimental API entry points which are redundant with this API. */ @@ -251,7 +259,6 @@ typedef enum { ZSTD_fast=1, Only the order (from fast to strong) is guaranteed */ } ZSTD_strategy; - typedef enum { /* compression parameters @@ -317,7 +324,6 @@ typedef enum { * The higher the value of selected strategy, the more complex it is, * resulting in stronger and slower compression. * Special: value 0 means "use default strategy". */ - /* LDM mode parameters */ ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. * This parameter is designed to improve compression ratio @@ -374,7 +380,7 @@ typedef enum { ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. * 0 means default, which is dynamically determined based on compression parameters. - * Job size must be a minimum of overlap size, or 1 MB, whichever is largest. + * Job size must be a minimum of overlap size, or ZSTDMT_JOBSIZE_MIN (= 512 KB), whichever is largest. * The minimum size is automatically and transparently enforced. */ ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. @@ -404,6 +410,8 @@ typedef enum { * ZSTD_c_stableOutBuffer * ZSTD_c_blockDelimiters * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. @@ -419,7 +427,10 @@ typedef enum { ZSTD_c_experimentalParam9=1006, ZSTD_c_experimentalParam10=1007, ZSTD_c_experimentalParam11=1008, - ZSTD_c_experimentalParam12=1009 + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, + ZSTD_c_experimentalParam15=1012 } ZSTD_cParameter; typedef struct { @@ -504,9 +515,9 @@ ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, const void* src, size_t srcSize); -/* ************************************* -* Advanced decompression API -***************************************/ +/* ********************************************* +* Advanced decompression API (Requires v1.4.0+) +************************************************/ /* The advanced API pushes parameters one by one into an existing DCtx context. * Parameters are sticky, and remain valid for all following frames @@ -668,7 +679,7 @@ typedef enum { : note : multithreaded compression will block to flush as much output as possible. */ } ZSTD_EndDirective; -/*! ZSTD_compressStream2() : +/*! ZSTD_compressStream2() : Requires v1.4.0+ * Behaves about the same as ZSTD_compressStream, with additional control on end directive. * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode) @@ -714,11 +725,11 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output /* ***************************************************************************** - * This following is a legacy streaming API. + * This following is a legacy streaming API, available since v1.0+ . * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). * It is redundant, but remains fully supported. - * Advanced parameters and dictionary compression can only be used through the - * new API. + * Streaming in combination with advanced parameters and dictionary compression + * can only be used through the new API. ******************************************************************************/ /*! @@ -796,7 +807,7 @@ ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output /*! ZSTD_compress_usingDict() : * Compression at an explicit compression level using a Dictionary. * A dictionary can be any arbitrary data segment (also called a prefix), - * or a buffer with specified information (see dictBuilder/zdict.h). + * or a buffer with specified information (see zdict.h). * Note : This function loads the dictionary, resulting in significant startup delay. * It's intended for a dictionary used only once. * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */ @@ -879,19 +890,25 @@ ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, * Dictionary helper functions *******************************/ -/*! ZSTD_getDictID_fromDict() : +/*! ZSTD_getDictID_fromDict() : Requires v1.4.0+ * Provides the dictID stored within dictionary. * if @return == 0, the dictionary is not conformant with Zstandard specification. * It can still be loaded, but as a content-only dictionary. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize); -/*! ZSTD_getDictID_fromDDict() : +/*! ZSTD_getDictID_fromCDict() : Requires v1.5.0+ + * Provides the dictID of the dictionary loaded into `cdict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); + +/*! ZSTD_getDictID_fromDDict() : Requires v1.4.0+ * Provides the dictID of the dictionary loaded into `ddict`. * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); -/*! ZSTD_getDictID_fromFrame() : +/*! ZSTD_getDictID_fromFrame() : Requires v1.4.0+ * Provides the dictID required to decompressed the frame stored within `src`. * If @return == 0, the dictID could not be decoded. * This could for one of the following reasons : @@ -905,16 +922,16 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); /* ***************************************************************************** - * Advanced dictionary and prefix API + * Advanced dictionary and prefix API (Requires v1.4.0+) * * This API allows dictionaries to be used with ZSTD_compress2(), - * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and + * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and * only reset with the context is reset with ZSTD_reset_parameters or * ZSTD_reset_session_and_parameters. Prefixes are single-use. ******************************************************************************/ -/*! ZSTD_CCtx_loadDictionary() : +/*! ZSTD_CCtx_loadDictionary() : Requires v1.4.0+ * Create an internal CDict from `dict` buffer. * Decompression will have to use same dictionary. * @result : 0, or an error code (which can be tested with ZSTD_isError()). @@ -933,7 +950,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * to precisely select how dictionary content must be interpreted. */ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); -/*! ZSTD_CCtx_refCDict() : +/*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ * Reference a prepared dictionary, to be used for all next compressed frames. * Note that compression parameters are enforced from within CDict, * and supersede any compression parameter previously set within CCtx. @@ -947,7 +964,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, s * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); -/*! ZSTD_CCtx_refPrefix() : +/*! ZSTD_CCtx_refPrefix() : Requires v1.4.0+ * Reference a prefix (single-usage dictionary) for next compressed frame. * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end). * Decompression will need same prefix to properly regenerate data. @@ -968,7 +985,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize); -/*! ZSTD_DCtx_loadDictionary() : +/*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ * Create an internal DDict from dict buffer, * to be used to decompress next frames. * The dictionary remains valid for all future frames, until explicitly invalidated. @@ -985,7 +1002,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, */ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); -/*! ZSTD_DCtx_refDDict() : +/*! ZSTD_DCtx_refDDict() : Requires v1.4.0+ * Reference a prepared dictionary, to be used to decompress next frames. * The dictionary remains active for decompression of future frames using same DCtx. * @@ -1003,7 +1020,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s */ ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); -/*! ZSTD_DCtx_refPrefix() : +/*! ZSTD_DCtx_refPrefix() : Requires v1.4.0+ * Reference a prefix (single-usage dictionary) to decompress next frame. * This is the reverse operation of ZSTD_CCtx_refPrefix(), * and must use the same prefix as the one used during compression. @@ -1024,7 +1041,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, /* === Memory management === */ -/*! ZSTD_sizeof_*() : +/*! ZSTD_sizeof_*() : Requires v1.4.0+ * These functions give the _current_ memory usage of selected object. * Note that object memory usage can evolve (increase or decrease) over time. */ ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); @@ -1049,6 +1066,29 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY +/* This can be overridden externally to hide static symbols. */ +#ifndef ZSTDLIB_STATIC_API +#define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE +#endif + +/* Deprecation warnings : + * Should these warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. + * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. + */ +#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +#else +# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +# elif (__GNUC__ >= 3) +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +# else +# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +# endif +#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ + /* ************************************************************************************** * experimental API (static linking only) **************************************************************************************** @@ -1111,9 +1151,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTD_SRCSIZEHINT_MIN 0 #define ZSTD_SRCSIZEHINT_MAX INT_MAX -/* internal */ -#define ZSTD_HASHLOG3_MAX 17 - /* --- Advanced types --- */ @@ -1255,6 +1292,15 @@ typedef enum { ZSTD_lcm_uncompressed = 2 /*< Always emit uncompressed literals. */ } ZSTD_literalCompressionMode_e; +typedef enum { + /* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final + * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable + * or ZSTD_ps_disable allow for a force enable/disable the feature. + */ + ZSTD_ps_auto = 0, /* Let the library automatically determine whether the feature shall be enabled */ + ZSTD_ps_enable = 1, /* Force-enable the feature */ + ZSTD_ps_disable = 2 /* Do not use the feature */ +} ZSTD_paramSwitch_e; /* ************************************* * Frame size functions @@ -1281,7 +1327,7 @@ typedef enum { * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to * read each contained frame header. This is fast as most of the data is skipped, * however it does mean that all frame data must be present and valid. */ -ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); +ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); /*! ZSTD_decompressBound() : * `src` should point to the start of a series of ZSTD encoded and/or skippable frames @@ -1296,13 +1342,13 @@ ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by: * upper-bound = # blocks * min(128 KB, Window_Size) */ -ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); +ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); /*! ZSTD_frameHeaderSize() : * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. * @return : size of the Frame Header, * or an error code (if srcSize is too small) */ -ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); typedef enum { ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ @@ -1325,7 +1371,7 @@ typedef enum { * @return : number of sequences generated */ -ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, size_t outSeqsSize, const void* src, size_t srcSize); /*! ZSTD_mergeBlockDelimiters() : @@ -1339,7 +1385,7 @@ ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, * setting of ZSTD_c_blockDelimiters as ZSTD_sf_noBlockDelimiters * @return : number of sequences left after merging */ -ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); +ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); /*! ZSTD_compressSequences() : * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. @@ -1369,7 +1415,7 @@ ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t se * and cannot emit an RLE block that disagrees with the repcode history * @return : final compressed size or a ZSTD error. */ -ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, const ZSTD_Sequence* inSeqs, size_t inSeqsSize, const void* src, size_t srcSize); @@ -1387,9 +1433,29 @@ ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size * * @return : number of bytes written or a ZSTD error. */ -ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, +ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned magicVariant); +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. + */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, + const void* src, size_t srcSize); + +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + */ +ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + + /* ************************************* * Memory management @@ -1418,10 +1484,10 @@ ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, * Note 2 : only single-threaded compression is supported. * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. */ -ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); -ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); -ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); -ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); /*! ZSTD_estimateCStreamSize() : * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. @@ -1436,20 +1502,20 @@ ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), * an internal ?Dict will be created, which additional size is not estimated here. * In this case, get total size by adding ZSTD_estimate?DictSize */ -ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel); -ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); -ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); -ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize); -ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); /*! ZSTD_estimate?DictSize() : * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict(). * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced(). * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller. */ -ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); -ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); -ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); /*! ZSTD_initStatic*() : * Initialize an object using a pre-allocated fixed-size buffer. @@ -1472,20 +1538,20 @@ ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e * Limitation 2 : static cctx currently not compatible with multi-threading. * Limitation 3 : static dctx is incompatible with legacy support. */ -ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); -ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /*< same as ZSTD_initStaticCCtx() */ +ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /*< same as ZSTD_initStaticCCtx() */ -ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); -ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /*< same as ZSTD_initStaticDCtx() */ +ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /*< same as ZSTD_initStaticDCtx() */ -ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict( +ZSTDLIB_STATIC_API const ZSTD_CDict* ZSTD_initStaticCDict( void* workspace, size_t workspaceSize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams); -ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( +ZSTDLIB_STATIC_API const ZSTD_DDict* ZSTD_initStaticDDict( void* workspace, size_t workspaceSize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, @@ -1504,44 +1570,44 @@ static __attribute__((__unused__)) ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /*< this constant defers to stdlib's functions */ -ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); +ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams, ZSTD_customMem customMem); -/* ! Thread pool : - * These prototypes make it possible to share a thread pool among multiple compression contexts. - * This can limit resources for applications with multiple threads where each one uses - * a threaded compression mode (via ZSTD_c_nbWorkers parameter). - * ZSTD_createThreadPool creates a new thread pool with a given number of threads. - * Note that the lifetime of such pool must exist while being used. - * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value - * to use an internal thread pool). - * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. +/*! Thread pool : + * These prototypes make it possible to share a thread pool among multiple compression contexts. + * This can limit resources for applications with multiple threads where each one uses + * a threaded compression mode (via ZSTD_c_nbWorkers parameter). + * ZSTD_createThreadPool creates a new thread pool with a given number of threads. + * Note that the lifetime of such pool must exist while being used. + * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value + * to use an internal thread pool). + * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. */ typedef struct POOL_ctx_s ZSTD_threadPool; -ZSTDLIB_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); -ZSTDLIB_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* accept NULL pointer */ -ZSTDLIB_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool); +ZSTDLIB_STATIC_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); +ZSTDLIB_STATIC_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* accept NULL pointer */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool); /* * This API is temporary and is expected to change or disappear in the future! */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced2( const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, const ZSTD_CCtx_params* cctxParams, ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( +ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_advanced( const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, @@ -1558,28 +1624,22 @@ ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( * As a consequence, `dictBuffer` **must** outlive CDict, * and its content must remain unmodified throughout the lifetime of CDict. * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); - -/*! ZSTD_getDictID_fromCDict() : - * Provides the dictID of the dictionary loaded into `cdict`. - * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. - * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ -ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); /*! ZSTD_getCParams() : * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. * `estimatedSrcSize` value is optional, select 0 if not known */ -ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); /*! ZSTD_getParams() : * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`. * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */ -ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); /*! ZSTD_checkCParams() : * Ensure param values remain within authorized range. * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ -ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); +ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); /*! ZSTD_adjustCParams() : * optimize params for a given `srcSize` and `dictSize`. @@ -1587,23 +1647,25 @@ ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); * `dictSize` must be `0` when there is no dictionary. * cPar can be invalid : all parameters will be clamped within valid range in the @return struct. * This function never fails (wide contract) */ -ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); /*! ZSTD_compress_advanced() : * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. - * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + * This prototype will generate compilation warnings. */ +ZSTD_DEPRECATED("use ZSTD_compress2") +size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict,size_t dictSize, ZSTD_parameters params); /*! ZSTD_compress_usingCDict_advanced() : - * Note : this function is now REDUNDANT. + * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. - * This prototype will be marked as deprecated and generate compilation warning in some future version */ -ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + * This prototype will generate compilation warnings. */ +ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict, @@ -1613,18 +1675,18 @@ ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, /*! ZSTD_CCtx_loadDictionary_byReference() : * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx. * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */ -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); /*! ZSTD_CCtx_loadDictionary_advanced() : * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over * how to load the dictionary (by copy ? by reference ?) * and how to interpret it (automatic ? force raw mode ? full mode only ?) */ -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); /*! ZSTD_CCtx_refPrefix_advanced() : * Same as ZSTD_CCtx_refPrefix(), but gives finer control over * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ -ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); /* === experimental parameters === */ /* these parameters can be used with ZSTD_setParameter() @@ -1663,9 +1725,15 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre * See the comments on that enum for an explanation of the feature. */ #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 -/* Controls how the literals are compressed (default is auto). - * The value must be of type ZSTD_literalCompressionMode_e. - * See ZSTD_literalCompressionMode_t enum definition for details. +/* Controlled with ZSTD_paramSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never compress literals. + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed literals + * may still be emitted if huffman is not beneficial to use.) + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * literals compression based on the compression parameters - specifically, + * negative compression levels do not use literal compression. */ #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 @@ -1728,7 +1796,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre * * Note that this means that the CDict tables can no longer be copied into the * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be - * useable. The dictionary can only be attached or reloaded. + * usable. The dictionary can only be attached or reloaded. * * In general, you should expect compression to be faster--sometimes very much * so--and CDict creation to be slightly slower. Eventually, we will probably @@ -1817,12 +1885,55 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre */ #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 +/* ZSTD_c_useBlockSplitter + * Controlled with ZSTD_paramSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use block splitter. + * Set to ZSTD_ps_enable to always use block splitter. + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * block splitting based on the compression parameters. + */ +#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13 + +/* ZSTD_c_useRowMatchFinder + * Controlled with ZSTD_paramSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use row-based matchfinder. + * Set to ZSTD_ps_enable to force usage of row-based matchfinder. + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * the row-based matchfinder based on support for SIMD instructions and the window log. + * Note that this only pertains to compression strategies: greedy, lazy, and lazy2 + */ +#define ZSTD_c_useRowMatchFinder ZSTD_c_experimentalParam14 + +/* ZSTD_c_deterministicRefPrefix + * Default is 0 == disabled. Set to 1 to enable. + * + * Zstd produces different results for prefix compression when the prefix is + * directly adjacent to the data about to be compressed vs. when it isn't. + * This is because zstd detects that the two buffers are contiguous and it can + * use a more efficient match finding algorithm. However, this produces different + * results than when the two buffers are non-contiguous. This flag forces zstd + * to always load the prefix in non-contiguous mode, even if it happens to be + * adjacent to the data, to guarantee determinism. + * + * If you really care about determinism when using a dictionary or prefix, + * like when doing delta compression, you should select this option. It comes + * at a speed penalty of about ~2.5% if the dictionary and data happened to be + * contiguous, and is free if they weren't contiguous. We don't expect that + * intentionally making the dictionary and data contiguous will be worth the + * cost to memcpy() the data. + */ +#define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. * @return : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); /*! ZSTD_CCtx_params : @@ -1842,27 +1953,27 @@ ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams() * for static allocation of CCtx for single-threaded compression. */ -ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); -ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); /* accept NULL pointer */ +ZSTDLIB_STATIC_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); +ZSTDLIB_STATIC_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); /* accept NULL pointer */ /*! ZSTD_CCtxParams_reset() : * Reset params to default values. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); /*! ZSTD_CCtxParams_init() : * Initializes the compression parameters of cctxParams according to * compression level. All other parameters are reset to their default values. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); /*! ZSTD_CCtxParams_init_advanced() : * Initializes the compression and frame parameters of cctxParams according to * params. All other parameters are reset to their default values. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); -/*! ZSTD_CCtxParams_setParameter() : +/*! ZSTD_CCtxParams_setParameter() : Requires v1.4.0+ * Similar to ZSTD_CCtx_setParameter. * Set one compression parameter, selected by enum ZSTD_cParameter. * Parameters must be applied to a ZSTD_CCtx using @@ -1870,14 +1981,14 @@ ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, Z * @result : a code representing success or failure (which can be tested with * ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); /*! ZSTD_CCtxParams_getParameter() : * Similar to ZSTD_CCtx_getParameter. * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. * @result : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); /*! ZSTD_CCtx_setParametersUsingCCtxParams() : * Apply a set of ZSTD_CCtx_params to the compression context. @@ -1886,7 +1997,7 @@ ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, * if nbWorkers>=1, new parameters will be picked up at next job, * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated). */ -ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params); /*! ZSTD_compressStream2_simpleArgs() : @@ -1895,7 +2006,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( * This variant might be helpful for binders from dynamic languages * which have troubles handling structures containing memory pointers. */ -ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( +ZSTDLIB_STATIC_API size_t ZSTD_compressStream2_simpleArgs ( ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos, @@ -1911,33 +2022,33 @@ ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. * Note 3 : Skippable Frame Identifiers are considered valid. */ -ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size); +ZSTDLIB_STATIC_API unsigned ZSTD_isFrame(const void* buffer, size_t size); /*! ZSTD_createDDict_byReference() : * Create a digested dictionary, ready to start decompression operation without startup delay. * Dictionary content is referenced, and therefore stays in dictBuffer. * It is important that dictBuffer outlives DDict, * it must remain read accessible throughout the lifetime of DDict */ -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); /*! ZSTD_DCtx_loadDictionary_byReference() : * Same as ZSTD_DCtx_loadDictionary(), * but references `dict` content instead of copying it into `dctx`. * This saves memory if `dict` remains around., * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */ -ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); /*! ZSTD_DCtx_loadDictionary_advanced() : * Same as ZSTD_DCtx_loadDictionary(), * but gives direct control over * how to load the dictionary (by copy ? by reference ?) * and how to interpret it (automatic ? force raw mode ? full mode only ?). */ -ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); /*! ZSTD_DCtx_refPrefix_advanced() : * Same as ZSTD_DCtx_refPrefix(), but gives finer control over * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ -ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); /*! ZSTD_DCtx_setMaxWindowSize() : * Refuses allocating internal buffers for frames requiring a window size larger than provided limit. @@ -1946,14 +2057,14 @@ ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* pre * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize); /*! ZSTD_DCtx_getParameter() : * Get the requested decompression parameter value, selected by enum ZSTD_dParameter, * and store it into int* value. * @return : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value); /* ZSTD_d_format * experimental parameter, @@ -2028,11 +2139,13 @@ ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). * Instruct the decoder context about what kind of data to decode next. * This instruction is mandatory to decode data without a fully-formed header, * such ZSTD_f_zstd1_magicless for example. * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); +ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") +size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); /*! ZSTD_decompressStream_simpleArgs() : * Same as ZSTD_decompressStream(), @@ -2040,7 +2153,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); * This can be helpful for binders from dynamic languages * which have troubles handling structures containing memory pointers. */ -ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( +ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos); @@ -2056,7 +2169,7 @@ ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( /*===== Advanced Streaming compression functions =====*/ /*! ZSTD_initCStream_srcSize() : - * This function is deprecated, and equivalent to: + * This function is DEPRECATED, and equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); @@ -2065,15 +2178,15 @@ ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( * pledgedSrcSize must be correct. If it is not known at init time, use * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, * "0" also disables frame content size field. It may be enabled in the future. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); /*! ZSTD_initCStream_usingDict() : - * This function is deprecated, and is equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); @@ -2082,15 +2195,15 @@ ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, * dict == NULL or dictSize < 8, in which case no dict is used. * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /*! ZSTD_initCStream_advanced() : - * This function is deprecated, and is approximately equivalent to: + * This function is DEPRECATED, and is approximately equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * // Pseudocode: Set each zstd parameter and leave the rest as-is. * for ((param, value) : params) { @@ -2102,23 +2215,24 @@ ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. * pledgedSrcSize must be correct. * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_advanced(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*! ZSTD_initCStream_usingCDict() : - * This function is deprecated, and equivalent to: + * This function is DEPRECATED, and equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, cdict); * * note : cdict will just be referenced, and must outlive compression session - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); +ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") +size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); /*! ZSTD_initCStream_usingCDict_advanced() : * This function is DEPRECATED, and is approximately equivalent to: @@ -2133,18 +2247,21 @@ ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDi * same as ZSTD_initCStream_usingCDict(), with control over frame parameters. * pledgedSrcSize must be correct. If srcSize is not known at init time, use * value ZSTD_CONTENTSIZE_UNKNOWN. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") +size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, unsigned long long pledgedSrcSize); /*! ZSTD_resetCStream() : - * This function is deprecated, and is equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * Note: ZSTD_resetCStream() interprets pledgedSrcSize == 0 as ZSTD_CONTENTSIZE_UNKNOWN, but + * ZSTD_CCtx_setPledgedSrcSize() does not do the same, so ZSTD_CONTENTSIZE_UNKNOWN must be + * explicitly specified. * * start a new frame, using same parameters from previous frame. * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. @@ -2154,9 +2271,10 @@ ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, * but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. * @return : 0, or an error code (which can be tested using ZSTD_isError()) - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); typedef struct { @@ -2174,7 +2292,7 @@ typedef struct { * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed. * Aggregates progression inside active worker threads. */ -ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); +ZSTDLIB_STATIC_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); /*! ZSTD_toFlushNow() : * Tell how many bytes are ready to be flushed immediately. @@ -2189,7 +2307,7 @@ ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx * therefore flush speed is limited by production speed of oldest job * irrespective of the speed of concurrent (and newer) jobs. */ -ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); +ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); /*===== Advanced Streaming decompression functions =====*/ @@ -2203,7 +2321,7 @@ ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); * note: no dictionary will be used if dict == NULL or dictSize < 8 * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /*! * This function is deprecated, and is equivalent to: @@ -2214,7 +2332,7 @@ ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dic * note : ddict is referenced, it must outlive decompression session * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); +ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); /*! * This function is deprecated, and is equivalent to: @@ -2224,7 +2342,7 @@ ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDi * re-use decompression parameters from previous init; saves dictionary loading * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); +ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); /* ******************************************************************* @@ -2243,8 +2361,7 @@ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); ZSTD_CCtx object can be re-used multiple times within successive compression operations. Start by initializing a context. - Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression, - or ZSTD_compressBegin_advanced(), for finer parameter control. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() Then, consume your input using ZSTD_compressContinue(). @@ -2267,17 +2384,19 @@ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); */ /*===== Buffer-less streaming compression functions =====*/ -ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); -ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); -ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ -ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ -ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ -ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ - -ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - - +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ +ZSTD_DEPRECATED("use advanced API to access custom parameters") +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTD_DEPRECATED("use advanced API to access custom parameters") +size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ /* Buffer-less streaming decompression (synchronous mode) @@ -2368,24 +2487,24 @@ typedef struct { * @return : 0, `zfhPtr` is correctly filled, * >0, `srcSize` is too small, value is wanted `srcSize` amount, * or an error code, which can be tested using ZSTD_isError() */ -ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ /*! ZSTD_getFrameHeader_advanced() : * same as ZSTD_getFrameHeader(), * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ -ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); -ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); +ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ -ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); -ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); -ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); -ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); -ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); /* misc */ -ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); +ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; -ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); @@ -2422,10 +2541,10 @@ ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); */ /*===== Raw zstd block functions =====*/ -ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); -ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ +ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ -- cgit From 398900498485fa9465386454681763d81efaad25 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 21 Oct 2022 16:09:59 +0100 Subject: net: sfp: provide a definition for the power level select bit Provide a named definition for the power level select bit in the extended status register, rather than using BIT(0) in the code. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index d1f343853b6c..01ae9f1dd2ad 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -489,6 +489,8 @@ enum { SFP_WARN1_RXPWR_LOW = BIT(6), SFP_EXT_STATUS = 0x76, + SFP_EXT_STATUS_PWRLVL_SELECT = BIT(0), + SFP_VSL = 0x78, SFP_PAGE = 0x7f, }; -- cgit From 848dba781f1951636c966c9f3a6a41a5b2f8b572 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 24 Oct 2022 14:39:33 +0200 Subject: container_of: remove container_of_safe() It came in from a staging driver that has been long removed from the tree, and there are no in-kernel users of the macro, and it's very dubious if anyone should ever use this thing, so just remove it entirely. Reviewed-by: Sakari Ailus Acked-by: Rafael J. Wysocki Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20221024123933.3331116-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/container_of.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/container_of.h b/include/linux/container_of.h index 2f4944b791b8..a6f242137b11 100644 --- a/include/linux/container_of.h +++ b/include/linux/container_of.h @@ -21,20 +21,4 @@ "pointer type mismatch in container_of()"); \ ((type *)(__mptr - offsetof(type, member))); }) -/** - * container_of_safe - cast a member of a structure out to the containing structure - * @ptr: the pointer to the member. - * @type: the type of the container struct this is embedded in. - * @member: the name of the member within the struct. - * - * If IS_ERR_OR_NULL(ptr), ptr is returned unchanged. - */ -#define container_of_safe(ptr, type, member) ({ \ - void *__mptr = (void *)(ptr); \ - static_assert(__same_type(*(ptr), ((type *)0)->member) || \ - __same_type(*(ptr), void), \ - "pointer type mismatch in container_of_safe()"); \ - IS_ERR_OR_NULL(__mptr) ? ERR_CAST(__mptr) : \ - ((type *)(__mptr - offsetof(type, member))); }) - #endif /* _LINUX_CONTAINER_OF_H */ -- cgit From 7376e561fd2e017e9a53f975209777234b8b434e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 24 Oct 2022 14:16:27 +0300 Subject: linux/container_of.h: Warn about loss of constness container_of() casts the original type to another which leads to the loss of the const qualifier if it is not specified in the caller-provided type. This easily leads to container_of() returning a non-const pointer to a const struct which the C compiler does not warn about. Acked-by: Andy Shevchenko Signed-off-by: Sakari Ailus Link: https://lore.kernel.org/r/20221024111627.75183-1-sakari.ailus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/container_of.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/container_of.h b/include/linux/container_of.h index a6f242137b11..2008e9f4058c 100644 --- a/include/linux/container_of.h +++ b/include/linux/container_of.h @@ -13,6 +13,7 @@ * @type: the type of the container struct this is embedded in. * @member: the name of the member within the struct. * + * WARNING: any const qualifier of @ptr is lost. */ #define container_of(ptr, type, member) ({ \ void *__mptr = (void *)(ptr); \ -- cgit From c408b3d1d9bbc7de5fb0304fea424ef2539da616 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 17 Oct 2022 15:33:01 +0530 Subject: thermal: Validate new state in cur_state_store() In cur_state_store(), the new state of the cooling device is received from user-space and is not validated by the thermal core but the same is left for the individual drivers to take care of. Apart from duplicating the code it leaves possibility for introducing bugs where a driver may not do it right. Lets make the thermal core check the new state itself and store the max value in the cooling device structure. Link: https://lore.kernel.org/all/Y0ltRJRjO7AkawvE@kili/ Reported-by: Dan Carpenter Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 9ecc128944a1..5e093602e8fc 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -100,6 +100,7 @@ struct thermal_cooling_device_ops { struct thermal_cooling_device { int id; char *type; + unsigned long max_state; struct device device; struct device_node *np; void *devdata; -- cgit From 73feb8d5fa3b755bb51077c0aabfb6aa556fd498 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Oct 2022 15:41:41 +0200 Subject: kallsyms: Make module_kallsyms_on_each_symbol generally available Making module_kallsyms_on_each_symbol generally available, so it can be used outside CONFIG_LIVEPATCH option in following changes. Rather than adding another ifdef option let's make the function generally available (when CONFIG_KALLSYMS and CONFIG_MODULES options are defined). Cc: Christoph Hellwig Acked-by: Song Liu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20221025134148.3300700-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index ec61fb53979a..35876e89eb93 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -879,8 +879,17 @@ static inline bool module_sig_ok(struct module *module) } #endif /* CONFIG_MODULE_SIG */ +#if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS) int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); +#else +static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ #endif /* _LINUX_MODULE_H */ -- cgit From a55b70f1273a54b33482db8b2568da435fefd6c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Oct 2022 08:59:16 -0700 Subject: block: remove bio_start_io_acct_time bio_start_io_acct_time is not actually used anywhere, so remove it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221025155916.270303-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 50e358a19d98..57ed49f20d2e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1458,7 +1458,6 @@ unsigned long bdev_start_io_acct(struct block_device *bdev, void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time); -void bio_start_io_acct_time(struct bio *bio, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev); -- cgit From b5f0de6df6dce8d641ef58ef7012f3304dffb9a1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 02:56:03 -0700 Subject: net: dev: Convert sa_data to flexible array in struct sockaddr One of the worst offenders of "fake flexible arrays" is struct sockaddr, as it is the classic example of why GCC and Clang have been traditionally forced to treat all trailing arrays as fake flexible arrays: in the distant misty past, sa_data became too small, and code started just treating it as a flexible array, even though it was fixed-size. The special case by the compiler is specifically that sizeof(sa->sa_data) and FORTIFY_SOURCE (which uses __builtin_object_size(sa->sa_data, 1)) do not agree (14 and -1 respectively), which makes FORTIFY_SOURCE treat it as a flexible array. However, the coming -fstrict-flex-arrays compiler flag will remove these special cases so that FORTIFY_SOURCE can gain coverage over all the trailing arrays in the kernel that are _not_ supposed to be treated as a flexible array. To deal with this change, convert sa_data to a true flexible array. To keep the structure size the same, move sa_data into a union with a newly introduced sa_data_min with the original size. The result is that FORTIFY_SOURCE can continue to have no idea how large sa_data may actually be, but anything using sizeof(sa->sa_data) must switch to sizeof(sa->sa_data_min). Cc: Jens Axboe Cc: Pavel Begunkov Cc: David Ahern Cc: Dylan Yudaken Cc: Yajun Deng Cc: Petr Machata Cc: Hangbin Liu Cc: Leon Romanovsky Cc: syzbot Cc: Willem de Bruijn Cc: Pablo Neira Ayuso Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221018095503.never.671-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index de3701a2a212..13c3a237b9c9 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -33,7 +33,10 @@ typedef __kernel_sa_family_t sa_family_t; struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ - char sa_data[14]; /* 14 bytes of protocol address */ + union { + char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ + DECLARE_FLEX_ARRAY(char, sa_data); + }; }; struct linger { -- cgit From b179c98f76978a0fae072c2b2dad8e143218afd8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 25 Oct 2022 12:17:53 -0700 Subject: block: Remove request.write_hint Commit c75e707fe1aa ("block: remove the per-bio/request write hint") removed all code that uses the struct request write_hint member. Hence also remove 'write_hint' itself. Reviewed-by: Ming Lei Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20221025191755.1711437-2-bvanassche@acm.org Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ba18e9bdb799..569053ed959d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,7 +140,6 @@ struct request { struct blk_crypto_keyslot *crypt_keyslot; #endif - unsigned short write_hint; unsigned short ioprio; enum mq_rq_state state; -- cgit From 2b5f9dad32ed19e8db3b0f10a84aa824a219803b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 20 Sep 2022 17:31:19 -0700 Subject: fs/exec: switch timens when a task gets a new mm Changing a time namespace requires remapping a vvar page, so we don't want to allow doing that if any other tasks can use the same mm. Currently, we install a time namespace when a task is created with a new vm. exec() is another case when a task gets a new mm and so it can switch a time namespace safely, but it isn't handled now. One more issue of the current interface is that clone() with CLONE_VM isn't allowed if the current task has unshared a time namespace (timens_for_children doesn't match the current timens). Both these issues make some inconvenience for users. For example, Alexey and Florian reported that posix_spawn() uses vfork+exec and this pattern doesn't work with time namespaces due to the both described issues. LXC needed to workaround the exec() issue by calling setns. In the commit 133e2d3e81de5 ("fs/exec: allow to unshare a time namespace on vfork+exec"), we tried to fix these issues with minimal impact on UAPI. But it adds extra complexity and some undesirable side effects. Eric suggested fixing the issues properly because here are all the reasons to suppose that there are no users that depend on the old behavior. Cc: Alexey Izbyshev Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Florian Weimer Cc: Kees Cook Suggested-by: "Eric W. Biederman" Origin-author: "Eric W. Biederman" Signed-off-by: Andrei Vagin Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220921003120.209637-1-avagin@google.com --- include/linux/nsproxy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index cdb171efc7cb..fee881cded01 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -94,6 +94,7 @@ static inline struct cred *nsset_cred(struct nsset *set) int copy_namespaces(unsigned long flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); +int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); -- cgit From 3437d67a8d92a87aaba9107ceff6b4cc33b7cb94 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 25 Oct 2022 20:57:44 +0800 Subject: lsm: remove obsoleted comments for security hooks Remove the following obsoleted comments for security hooks: 1. sb_copy_data, the hook function has been removed since commit 5b4002391153 ("LSM: turn sb_eat_lsm_opts() into a method"). 2. sb_parse_opts_str, the hook function has been removed since commit 757cbe597fe8 ("LSM: new method: ->sb_add_mnt_opt()"). They are obsoleted comments, so remove them. Signed-off-by: Gaosheng Cui Reviewed-by: Casey Schaufler [PM: subj line tweaks] Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 4ec80b96c22e..9a7e69aff3d1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -136,15 +136,6 @@ * @flags contains the mount flags. * @data contains the filesystem-specific data. * Return 0 if permission is granted. - * @sb_copy_data: - * Allow mount option data to be copied prior to parsing by the filesystem, - * so that the security module can extract security-specific mount - * options cleanly (a filesystem may modify the data e.g. with strsep()). - * This also allows the original mount data to be stripped of security- - * specific options to avoid having to make filesystems aware of them. - * @orig the original mount data copied from userspace. - * @copy copied data which will be passed to the security module. - * Returns 0 if the copy was successful. * @sb_mnt_opts_compat: * Determine if the new mount options in @mnt_opts are allowed given * the existing mounted filesystem at @sb. @@ -180,10 +171,6 @@ * Copy all security options from a given superblock to another * @oldsb old superblock which contain information to clone * @newsb new superblock which needs filled in - * @sb_parse_opts_str: - * Parse a string of security data filling in the opts structure - * @options string containing all mount options known by the LSM - * @opts binary data structure usable by the LSM * @move_mount: * Check permission before a mount is moved. * @from_path indicates the mount that is going to be moved. -- cgit From 271de525e1d7f564e88a9d212c50998b49a54476 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:16 -0700 Subject: bpf: Remove prog->active check for bpf_lsm and bpf_iter The commit 64696c40d03c ("bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline") removed prog->active check for struct_ops prog. The bpf_lsm and bpf_iter is also using trampoline. Like struct_ops, the bpf_lsm and bpf_iter have fixed hooks for the prog to attach. The kernel does not call the same hook in a recursive way. This patch also removes the prog->active check for bpf_lsm and bpf_iter. A later patch has a test to reproduce the recursion issue for a sleepable bpf_lsm program. This patch appends the '_recur' naming to the existing enter and exit functions that track the prog->active counter. New __bpf_prog_{enter,exit}[_sleepable] function are added to skip the prog->active tracking. The '_struct_ops' version is also removed. It also moves the decision on picking the enter and exit function to the new bpf_trampoline_{enter,exit}(). It returns the '_recur' ones for all tracing progs to use. For bpf_lsm, bpf_iter, struct_ops (no prog->active tracking after 64696c40d03c), and bpf_lsm_cgroup (no prog->active tracking after 69fd337a975c7), it will return the functions that don't track the prog->active. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 24 ++++++++++-------------- include/linux/bpf_verifier.h | 15 ++++++++++++++- 2 files changed, 24 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e7d46d16032..1279e699dc98 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -854,22 +854,18 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *i const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *orig_call); -/* these two functions are called from generated trampoline */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); +typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); +bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); +bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); struct bpf_ksym { unsigned long start; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9e1e6965f407..1a32baa78ce2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -642,10 +642,23 @@ static inline u32 type_flag(u32 type) } /* only use after check_attach_btf_id() */ -static inline enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) +static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_EXT ? prog->aux->dst_prog->type : prog->type; } +static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) +{ + switch (resolve_prog_type(prog)) { + case BPF_PROG_TYPE_TRACING: + return prog->expected_attach_type != BPF_TRACE_ITER; + case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_LSM: + return false; + default: + return true; + } +} + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit From 0593dd34e53489557569d5e6d27371b49aa9b41f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:17 -0700 Subject: bpf: Append _recur naming to the bpf_task_storage helper proto This patch adds the "_recur" naming to the bpf_task_storage_{get,delete} proto. In a latter patch, they will only be used by the tracing programs that requires a deadlock detection because a tracing prog may use bpf_task_storage_{get,delete} recursively and cause a deadlock. Another following patch will add a different helper proto for the non tracing programs because they do not need the deadlock prevention. This patch does this rename to prepare for this future proto additions. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1279e699dc98..b04fe3f4342e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2519,8 +2519,8 @@ extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; -extern const struct bpf_func_proto bpf_task_storage_get_proto; -extern const struct bpf_func_proto bpf_task_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; -- cgit From 4279adb094a17132423f1271c3d11b593fc2327e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:20 -0700 Subject: bpf: Add new bpf_task_storage_get proto with no deadlock detection The bpf_lsm and bpf_iter do not recur that will cause a deadlock. The situation is similar to the bpf_pid_task_storage_lookup_elem() which is called from the syscall map_lookup_elem. It does not need deadlock detection. Otherwise, it will cause unnecessary failure when calling the bpf_task_storage_get() helper. This patch adds bpf_task_storage_get proto that does not do deadlock detection. It will be used by bpf_lsm and bpf_iter programs. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b04fe3f4342e..ef3f98afa45d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2520,6 +2520,7 @@ extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; -- cgit From 8a7dac37f27a3dfbd814bf29a73d6417db2c81d9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:22 -0700 Subject: bpf: Add new bpf_task_storage_delete proto with no deadlock detection The bpf_lsm and bpf_iter do not recur that will cause a deadlock. The situation is similar to the bpf_pid_task_storage_delete_elem() which is called from the syscall map_delete_elem. It does not need deadlock detection. Otherwise, it will cause unnecessary failure when calling the bpf_task_storage_delete() helper. This patch adds bpf_task_storage_delete proto that does not do deadlock detection. It will be used by bpf_lsm and bpf_iter program. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ef3f98afa45d..a5dbac8f5aba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2522,6 +2522,7 @@ extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; -- cgit From 5e67b8ef125bb6e83bf0f0442ad7ffc09e7956f9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:40 -0700 Subject: bpf: Make struct cgroup btf id global Make struct cgroup btf id global so later patch can reuse the same btf id. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042840.672602-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 2aea877d644f..c9744efd202f 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -265,5 +265,6 @@ MAX_BTF_TRACING_TYPE, }; extern u32 btf_tracing_ids[]; +extern u32 bpf_cgroup_btf_id[]; #endif -- cgit From c83597fa5dc6b322e9bdf929e5f4136a3f4aa4db Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:45 -0700 Subject: bpf: Refactor some inode/task/sk storage functions for reuse Refactor codes so that inode/task/sk storage implementation can maximally share the same code. I also added some comments in new function bpf_local_storage_unlink_nolock() to make codes easy to understand. There is no functionality change. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042845.672944-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 7ea18d4da84b..6d37a40cd90e 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -116,21 +116,22 @@ static struct bpf_local_storage_cache name = { \ .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ } -u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); -void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, - u16 idx); - /* Helper functions for bpf_local_storage */ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); -struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); +struct bpf_map * +bpf_local_storage_map_alloc(union bpf_attr *attr, + struct bpf_local_storage_cache *cache); struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, +bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage); + +void bpf_local_storage_map_free(struct bpf_map *map, + struct bpf_local_storage_cache *cache, int __percpu *busy_counter); int bpf_local_storage_map_check_btf(const struct bpf_map *map, @@ -141,10 +142,6 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_omem, bool use_trace_rcu); - void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, -- cgit From c4bcfb38a95edb1021a53f2d0356a78120ecfbe4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:50 -0700 Subject: bpf: Implement cgroup storage available to non-cgroup-attached bpf progs Similar to sk/inode/task storage, implement similar cgroup local storage. There already exists a local storage implementation for cgroup-attached bpf programs. See map type BPF_MAP_TYPE_CGROUP_STORAGE and helper bpf_get_local_storage(). But there are use cases such that non-cgroup attached bpf progs wants to access cgroup local storage data. For example, tc egress prog has access to sk and cgroup. It is possible to use sk local storage to emulate cgroup local storage by storing data in socket. But this is a waste as it could be lots of sockets belonging to a particular cgroup. Alternatively, a separate map can be created with cgroup id as the key. But this will introduce additional overhead to manipulate the new map. A cgroup local storage, similar to existing sk/inode/task storage, should help for this use case. The life-cycle of storage is managed with the life-cycle of the cgroup struct. i.e. the storage is destroyed along with the owning cgroup with a call to bpf_cgrp_storage_free() when cgroup itself is deleted. The userspace map operations can be done by using a cgroup fd as a key passed to the lookup, update and delete operations. Typically, the following code is used to get the current cgroup: struct task_struct *task = bpf_get_current_task_btf(); ... task->cgroups->dfl_cgrp ... and in structure task_struct definition: struct task_struct { .... struct css_set __rcu *cgroups; .... } With sleepable program, accessing task->cgroups is not protected by rcu_read_lock. So the current implementation only supports non-sleepable program and supporting sleepable program will be the next step together with adding rcu_read_lock protection for rcu tagged structures. Since map name BPF_MAP_TYPE_CGROUP_STORAGE has been used for old cgroup local storage support, the new map name BPF_MAP_TYPE_CGRP_STORAGE is used for cgroup storage available to non-cgroup-attached bpf programs. The old cgroup storage supports bpf_get_local_storage() helper to get the cgroup data. The new cgroup storage helper bpf_cgrp_storage_get() can provide similar functionality. While old cgroup storage pre-allocates storage memory, the new mechanism can also pre-allocate with a user space bpf_map_update_elem() call to avoid potential run-time memory allocation failure. Therefore, the new cgroup storage can provide all functionality w.r.t. the old one. So in uapi bpf.h, the old BPF_MAP_TYPE_CGROUP_STORAGE is alias to BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED to indicate the old cgroup storage can be deprecated since the new one can provide the same functionality. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042850.673791-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ include/linux/bpf_types.h | 1 + include/linux/cgroup-defs.h | 4 ++++ 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a5dbac8f5aba..9fd68b0b3e9c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2041,6 +2041,7 @@ struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); +void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, @@ -2295,6 +2296,10 @@ static inline bool has_current_bpf_ctx(void) static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) { } + +static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -2535,6 +2540,8 @@ extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; +extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; +extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2c6a4f2562a7..d4ee3ccd3753 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -86,6 +86,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_CGRP_STORAGE, cgrp_storage_map_ops) #endif #ifdef CONFIG_CGROUP_BPF BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8f481d1b159a..c466fdc3a32a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -504,6 +504,10 @@ struct cgroup { /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_local_storage __rcu *bpf_cgrp_storage; +#endif + /* All ancestors including self */ struct cgroup *ancestors[]; }; -- cgit From 9c4f28ddfb9c2e674fca24f68f12c1ffbfbffe41 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 26 Jun 2022 20:45:07 +0200 Subject: mnt_idmapping: add missing helpers Add missing helpers needed to convert all remaining places to the type safe idmapped mount helpers. After the conversion we will remove all the old helpers. Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- include/linux/mnt_idmapping.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index f6e5369d2928..cd1950ddc6a9 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -98,6 +98,26 @@ static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid) return vfsgid_valid(vfsgid) && __vfsgid_val(vfsgid) == __kgid_val(kgid); } +static inline bool vfsuid_gt_kuid(vfsuid_t vfsuid, kuid_t kuid) +{ + return __vfsuid_val(vfsuid) > __kuid_val(kuid); +} + +static inline bool vfsgid_gt_kgid(vfsgid_t vfsgid, kgid_t kgid) +{ + return __vfsgid_val(vfsgid) > __kgid_val(kgid); +} + +static inline bool vfsuid_lt_kuid(vfsuid_t vfsuid, kuid_t kuid) +{ + return __vfsuid_val(vfsuid) < __kuid_val(kuid); +} + +static inline bool vfsgid_lt_kgid(vfsgid_t vfsgid, kgid_t kgid) +{ + return __vfsgid_val(vfsgid) < __kgid_val(kgid); +} + /* * vfs{g,u}ids are created from k{g,u}ids. * We don't allow them to be created from regular {u,g}id. @@ -333,6 +353,12 @@ static inline bool vfsuid_has_fsmapping(struct user_namespace *mnt_userns, return uid_valid(from_vfsuid(mnt_userns, fs_userns, vfsuid)); } +static inline bool vfsuid_has_mapping(struct user_namespace *userns, + vfsuid_t vfsuid) +{ + return from_kuid(userns, AS_KUIDT(vfsuid)) != (uid_t)-1; +} + /** * vfsuid_into_kuid - convert vfsuid into kuid * @vfsuid: the vfsuid to convert @@ -419,6 +445,12 @@ static inline bool vfsgid_has_fsmapping(struct user_namespace *mnt_userns, return gid_valid(from_vfsgid(mnt_userns, fs_userns, vfsgid)); } +static inline bool vfsgid_has_mapping(struct user_namespace *userns, + vfsgid_t vfsgid) +{ + return from_kgid(userns, AS_KGIDT(vfsgid)) != (gid_t)-1; +} + /** * vfsgid_into_kgid - convert vfsgid into kgid * @vfsgid: the vfsgid to convert -- cgit From eb7718cdb73c6b0c93002f8f73f4dd4701f8d2bb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Jun 2022 12:53:54 +0200 Subject: fs: remove unused idmapping helpers Now that all places can deal with the new type safe helpers remove all of the old helpers. Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- include/linux/fs.h | 34 ---------------------- include/linux/mnt_idmapping.h | 68 ------------------------------------------- 2 files changed, 102 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b39c5efca180..0951283f2515 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1612,23 +1612,6 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) inode->i_gid = make_kgid(i_user_ns(inode), gid); } -/** - * i_uid_into_mnt - map an inode's i_uid down into a mnt_userns - * @mnt_userns: user namespace of the mount the inode was found from - * @inode: inode to map - * - * Note, this will eventually be removed completely in favor of the type-safe - * i_uid_into_vfsuid(). - * - * Return: the inode's i_uid mapped down according to @mnt_userns. - * If the inode's i_uid has no mapping INVALID_UID is returned. - */ -static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns, - const struct inode *inode) -{ - return AS_KUIDT(make_vfsuid(mnt_userns, i_user_ns(inode), inode->i_uid)); -} - /** * i_uid_into_vfsuid - map an inode's i_uid down into a mnt_userns * @mnt_userns: user namespace of the mount the inode was found from @@ -1681,23 +1664,6 @@ static inline void i_uid_update(struct user_namespace *mnt_userns, attr->ia_vfsuid); } -/** - * i_gid_into_mnt - map an inode's i_gid down into a mnt_userns - * @mnt_userns: user namespace of the mount the inode was found from - * @inode: inode to map - * - * Note, this will eventually be removed completely in favor of the type-safe - * i_gid_into_vfsgid(). - * - * Return: the inode's i_gid mapped down according to @mnt_userns. - * If the inode's i_gid has no mapping INVALID_GID is returned. - */ -static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns, - const struct inode *inode) -{ - return AS_KGIDT(make_vfsgid(mnt_userns, i_user_ns(inode), inode->i_gid)); -} - /** * i_gid_into_vfsgid - map an inode's i_gid down into a mnt_userns * @mnt_userns: user namespace of the mount the inode was found from diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index cd1950ddc6a9..c8002294a72d 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -228,13 +228,6 @@ static inline vfsuid_t make_vfsuid(struct user_namespace *mnt_userns, return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); } -static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - kuid_t kuid) -{ - return AS_KUIDT(make_vfsuid(mnt_userns, fs_userns, kuid)); -} - /** * make_vfsgid - map a filesystem kgid into a mnt_userns * @mnt_userns: the mount's idmapping @@ -273,13 +266,6 @@ static inline vfsgid_t make_vfsgid(struct user_namespace *mnt_userns, return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); } -static inline kgid_t mapped_kgid_fs(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - kgid_t kgid) -{ - return AS_KGIDT(make_vfsgid(mnt_userns, fs_userns, kgid)); -} - /** * from_vfsuid - map a vfsuid into the filesystem idmapping * @mnt_userns: the mount's idmapping @@ -307,33 +293,6 @@ static inline kuid_t from_vfsuid(struct user_namespace *mnt_userns, return make_kuid(fs_userns, uid); } -/** - * mapped_kuid_user - map a user kuid into a mnt_userns - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @kuid : kuid to be mapped - * - * Use the idmapping of @mnt_userns to remap a @kuid into @fs_userns. Use this - * function when preparing a @kuid to be written to disk or inode. - * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kuid unchanged. - * If initial_idmapping() tells us that the filesystem is not mounted with an - * idmapping we know the value of @kuid won't change when calling - * make_kuid() so we can simply retrieve the value via KUIDT_INIT() - * directly. - * - * Return: @kuid mapped according to @mnt_userns. - * If @kuid has no mapping in either @mnt_userns or @fs_userns INVALID_UID is - * returned. - */ -static inline kuid_t mapped_kuid_user(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - kuid_t kuid) -{ - return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid)); -} - /** * vfsuid_has_fsmapping - check whether a vfsuid maps into the filesystem * @mnt_userns: the mount's idmapping @@ -399,33 +358,6 @@ static inline kgid_t from_vfsgid(struct user_namespace *mnt_userns, return make_kgid(fs_userns, gid); } -/** - * mapped_kgid_user - map a user kgid into a mnt_userns - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @kgid : kgid to be mapped - * - * Use the idmapping of @mnt_userns to remap a @kgid into @fs_userns. Use this - * function when preparing a @kgid to be written to disk or inode. - * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kgid unchanged. - * If initial_idmapping() tells us that the filesystem is not mounted with an - * idmapping we know the value of @kgid won't change when calling - * make_kgid() so we can simply retrieve the value via KGIDT_INIT() - * directly. - * - * Return: @kgid mapped according to @mnt_userns. - * If @kgid has no mapping in either @mnt_userns or @fs_userns INVALID_GID is - * returned. - */ -static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - kgid_t kgid) -{ - return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid)); -} - /** * vfsgid_has_fsmapping - check whether a vfsgid maps into the filesystem * @mnt_userns: the mount's idmapping -- cgit From e753df8fbca592d36f539ed950fcdf412166c549 Mon Sep 17 00:00:00 2001 From: Michal Jaron Date: Tue, 25 Oct 2022 09:12:52 -0700 Subject: ice: Add support Flex RXD Add new VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC flag, opcode VIRTCHNL_OP_GET_SUPPORTED_RXDIDS and add member rxdid in struct virtchnl_rxq_info to support AVF Flex RXD extension. Add support to allow VF to query flexible descriptor RXDIDs supported by DDP package and configure Rx queues with selected RXDID for IAVF. Add code to allow VIRTCHNL_OP_GET_SUPPORTED_RXDIDS message to be processed. Add necessary macros for registers. Signed-off-by: Leyi Rong Signed-off-by: Xu Ting Signed-off-by: Michal Jaron Signed-off-by: Mateusz Palczewski Tested-by: Maxime Coquelin Tested-by: Konrad Jankowski Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20221025161252.1952939-1-jacob.e.keller@intel.com Signed-off-by: Paolo Abeni --- include/linux/avf/virtchnl.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2ce27e8e4f19..d91af50ac58d 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -136,7 +136,8 @@ enum virtchnl_ops { VIRTCHNL_OP_DISABLE_CHANNELS = 31, VIRTCHNL_OP_ADD_CLOUD_FILTER = 32, VIRTCHNL_OP_DEL_CLOUD_FILTER = 33, - /* opcode 34 - 44 are reserved */ + /* opcode 34 - 43 are reserved */ + VIRTCHNL_OP_GET_SUPPORTED_RXDIDS = 44, VIRTCHNL_OP_ADD_RSS_CFG = 45, VIRTCHNL_OP_DEL_RSS_CFG = 46, VIRTCHNL_OP_ADD_FDIR_FILTER = 47, @@ -263,6 +264,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM BIT(22) #define VIRTCHNL_VF_OFFLOAD_ADQ BIT(23) #define VIRTCHNL_VF_OFFLOAD_USO BIT(25) +#define VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC BIT(26) #define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF BIT(27) #define VIRTCHNL_VF_OFFLOAD_FDIR_PF BIT(28) @@ -318,7 +320,9 @@ struct virtchnl_rxq_info { u16 splithdr_enabled; /* deprecated with AVF 1.0 */ u32 databuffer_size; u32 max_pkt_size; - u32 pad1; + u8 pad0; + u8 rxdid; + u8 pad1[2]; u64 dma_ring_addr; enum virtchnl_rx_hsplit rx_split_pos; /* deprecated with AVF 1.0 */ u32 pad2; @@ -970,6 +974,10 @@ struct virtchnl_filter { VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); +struct virtchnl_supported_rxdids { + u64 supported_rxdids; +}; + /* VIRTCHNL_OP_EVENT * PF sends this message to inform the VF driver of events that may affect it. * No direct response is expected from the VF, though it may generate other @@ -1499,6 +1507,8 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_DEL_CLOUD_FILTER: valid_len = sizeof(struct virtchnl_filter); break; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + break; case VIRTCHNL_OP_ADD_RSS_CFG: case VIRTCHNL_OP_DEL_RSS_CFG: valid_len = sizeof(struct virtchnl_rss_cfg); -- cgit From 22168675bae75e158c459ef2dee3b6ebd52a80ed Mon Sep 17 00:00:00 2001 From: Marco Felsch Date: Fri, 30 Sep 2022 14:48:10 +0200 Subject: phy: dphy: add support to calculate the timing based on hs_clk_rate For MIPI-CSI sender use-case it is common to specify the allowed link-frequencies which should be used for the MIPI link and is half the hs-clock rate. This commit adds a helper to calculate the D-PHY timing based on the hs-clock rate so we don't need to calculate the timings within the driver. Signed-off-by: Marco Felsch Acked-by: Vinod Koul Signed-off-by: Sakari Ailus --- include/linux/phy/phy-mipi-dphy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy/phy-mipi-dphy.h b/include/linux/phy/phy-mipi-dphy.h index a877ffee845d..1ac128d78dfe 100644 --- a/include/linux/phy/phy-mipi-dphy.h +++ b/include/linux/phy/phy-mipi-dphy.h @@ -279,6 +279,9 @@ int phy_mipi_dphy_get_default_config(unsigned long pixel_clock, unsigned int bpp, unsigned int lanes, struct phy_configure_opts_mipi_dphy *cfg); +int phy_mipi_dphy_get_default_config_for_hsclk(unsigned long long hs_clk_rate, + unsigned int lanes, + struct phy_configure_opts_mipi_dphy *cfg); int phy_mipi_dphy_config_validate(struct phy_configure_opts_mipi_dphy *cfg); #endif /* __PHY_MIPI_DPHY_H_ */ -- cgit From bd27568117664b8b3e259721393df420ed51f57b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 8 Oct 2022 11:54:24 +0530 Subject: perf: Rewrite core context handling There have been various issues and limitations with the way perf uses (task) contexts to track events. Most notable is the single hardware PMU task context, which has resulted in a number of yucky things (both proposed and merged). Notably: - HW breakpoint PMU - ARM big.little PMU / Intel ADL PMU - Intel Branch Monitoring PMU - AMD IBS PMU - S390 cpum_cf PMU - PowerPC trace_imc PMU *Current design:* Currently we have a per task and per cpu perf_event_contexts: task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context ^ | ^ | ^ `---------------------------------' | `--> pmu ---' v ^ perf_event ------' Each task has an array of pointers to a perf_event_context. Each perf_event_context has a direct relation to a PMU and a group of events for that PMU. The task related perf_event_context's have a pointer back to that task. Each PMU has a per-cpu pointer to a per-cpu perf_cpu_context, which includes a perf_event_context, which again has a direct relation to that PMU, and a group of events for that PMU. The perf_cpu_context also tracks which task context is currently associated with that CPU and includes a few other things like the hrtimer for rotation etc. Each perf_event is then associated with its PMU and one perf_event_context. *Proposed design:* New design proposed by this patch reduce to a single task context and a single CPU context but adds some intermediate data-structures: task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context ^ | ^ ^ `---------------------------' | | | | perf_cpu_pmu_context <--. | `----. ^ | | | | | | v v | | ,--> perf_event_pmu_context | | | | | | | v v | perf_event ---> pmu ----------------' With the new design, perf_event_context will hold all events for all pmus in the (respective pinned/flexible) rbtrees. This can be achieved by adding pmu to rbtree key: {cpu, pmu, cgroup, group_index} Each perf_event_context carries a list of perf_event_pmu_context which is used to hold per-pmu-per-context state. For example, it keeps track of currently active events for that pmu, a pmu specific task_ctx_data, a flag to tell whether rotation is required or not etc. Additionally, perf_cpu_pmu_context is used to hold per-pmu-per-cpu state like hrtimer details to drive the event rotation, a pointer to perf_event_pmu_context of currently running task and some other ancillary information. Each perf_event is associated to it's pmu, perf_event_context and perf_event_pmu_context. Further optimizations to current implementation are possible. For example, ctx_resched() can be optimized to reschedule only single pmu events. Much thanks to Ravi for picking this up and pushing it towards completion. Signed-off-by: Peter Zijlstra (Intel) Co-developed-by: Ravi Bangoria Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221008062424.313-1-ravi.bangoria@amd.com --- include/linux/perf/arm_pmu.h | 2 +- include/linux/perf_event.h | 125 ++++++++++++++++++++++++++++++++----------- include/linux/sched.h | 2 +- 3 files changed, 96 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 0356cb6a215d..725968095ea9 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -100,7 +100,7 @@ struct arm_pmu { void (*stop)(struct arm_pmu *); void (*reset)(void *); int (*map_event)(struct perf_event *event); - int (*filter_match)(struct perf_event *event); + bool (*filter)(struct pmu *pmu, int cpu); int num_events; bool secure_access; /* 32-bit ARM only */ #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0031f7b4d9ab..c6a3bac76966 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -266,6 +266,7 @@ struct hw_perf_event { }; struct perf_event; +struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn @@ -308,7 +309,7 @@ struct pmu { int capabilities; int __percpu *pmu_disable_count; - struct perf_cpu_context __percpu *pmu_cpu_context; + struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; @@ -443,7 +444,7 @@ struct pmu { /* * context-switches callback */ - void (*sched_task) (struct perf_event_context *ctx, + void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, bool sched_in); /* @@ -457,8 +458,8 @@ struct pmu { * implementation and Perf core context switch handling callbacks for usage * examples. */ - void (*swap_task_ctx) (struct perf_event_context *prev, - struct perf_event_context *next); + void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc); /* optional */ /* @@ -522,9 +523,10 @@ struct pmu { /* optional */ /* - * Filter events for PMU-specific reasons. + * Skip programming this PMU on the given CPU. Typically needed for + * big.LITTLE things. */ - int (*filter_match) (struct perf_event *event); /* optional */ + bool (*filter) (struct pmu *pmu, int cpu); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. @@ -695,6 +697,11 @@ struct perf_event { int group_caps; struct perf_event *group_leader; + /* + * event->pmu will always point to pmu in which this event belongs. + * Whereas event->pmu_ctx->pmu may point to other pmu when group of + * different pmu events is created. + */ struct pmu *pmu; void *pmu_private; @@ -720,6 +727,12 @@ struct perf_event { struct hw_perf_event hw; struct perf_event_context *ctx; + /* + * event->pmu_ctx points to perf_event_pmu_context in which the event + * is added. This pmu_ctx can be of other pmu for sw event when that + * sw event is part of a group which also contains non-sw events. + */ + struct perf_event_pmu_context *pmu_ctx; atomic_long_t refcount; /* @@ -812,19 +825,69 @@ struct perf_event { #endif /* CONFIG_PERF_EVENTS */ }; +/* + * ,-----------------------[1:n]----------------------. + * V V + * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event + * ^ ^ | | + * `--------[1:n]---------' `-[n:1]-> pmu <-[1:n]-' + * + * + * struct perf_event_pmu_context lifetime is refcount based and RCU freed + * (similar to perf_event_context). Locking is as if it were a member of + * perf_event_context; specifically: + * + * modification, both: ctx->mutex && ctx->lock + * reading, either: ctx->mutex || ctx->lock + * + * There is one exception to this; namely put_pmu_ctx() isn't always called + * with ctx->mutex held; this means that as long as we can guarantee the epc + * has events the above rules hold. + * + * Specificially, sys_perf_event_open()'s group_leader case depends on + * ctx->mutex pinning the configuration. Since we hold a reference on + * group_leader (through the filedesc) it can't go away, therefore it's + * associated pmu_ctx must exist and cannot change due to ctx->mutex. + */ +struct perf_event_pmu_context { + struct pmu *pmu; + struct perf_event_context *ctx; + + struct list_head pmu_ctx_entry; + + struct list_head pinned_active; + struct list_head flexible_active; + + /* Used to avoid freeing per-cpu perf_event_pmu_context */ + unsigned int embedded : 1; + + unsigned int nr_events; + + atomic_t refcount; /* event <-> epc */ + struct rcu_head rcu_head; + + void *task_ctx_data; /* pmu specific data */ + /* + * Set when one or more (plausibly active) event can't be scheduled + * due to pmu overcommit or pmu constraints, except tolerant to + * events not necessary to be active due to scheduling constraints, + * such as cgroups. + */ + int rotate_necessary; +}; struct perf_event_groups { struct rb_root tree; u64 index; }; + /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { - struct pmu *pmu; /* * Protect the states of the events in the list, * nr_active, and the list: @@ -837,27 +900,21 @@ struct perf_event_context { */ struct mutex mutex; - struct list_head active_ctx_list; + struct list_head pmu_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; - struct list_head pinned_active; - struct list_head flexible_active; - int nr_events; - int nr_active; int nr_user; int is_active; + + int nr_task_data; int nr_stat; int nr_freq; int rotate_disable; - /* - * Set when nr_events != nr_active, except tolerant to events not - * necessary to be active due to scheduling constraints, such as cgroups. - */ - int rotate_necessary; - refcount_t refcount; + + refcount_t refcount; /* event <-> ctx */ struct task_struct *task; /* @@ -878,7 +935,6 @@ struct perf_event_context { #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif - void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; /* @@ -896,12 +952,13 @@ struct perf_event_context { */ #define PERF_NR_CONTEXTS 4 -/** - * struct perf_cpu_context - per cpu event context structure - */ -struct perf_cpu_context { - struct perf_event_context ctx; - struct perf_event_context *task_ctx; +struct perf_cpu_pmu_context { + struct perf_event_pmu_context epc; + struct perf_event_pmu_context *task_epc; + + struct list_head sched_cb_entry; + int sched_cb_usage; + int active_oncpu; int exclusive; @@ -909,16 +966,20 @@ struct perf_cpu_context { struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; +}; + +/** + * struct perf_event_cpu_context - per cpu event context structure + */ +struct perf_cpu_context { + struct perf_event_context ctx; + struct perf_event_context *task_ctx; + int online; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; - struct list_head cgrp_cpuctx_entry; #endif - struct list_head sched_cb_entry; - int sched_cb_usage; - - int online; /* * Per-CPU storage for iterators used in visit_groups_merge. The default * storage is of size 2 to hold the CPU and any CPU event iterators. @@ -982,6 +1043,8 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) #ifdef CONFIG_PERF_EVENTS +extern struct perf_event_context *perf_cpu_task_ctx(void); + extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, @@ -1187,7 +1250,7 @@ static inline int is_software_event(struct perf_event *event) */ static inline int in_software_context(struct perf_event *event) { - return event->ctx->pmu->task_ctx_nr == perf_sw_context; + return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context; } static inline int is_exclusive_pmu(struct pmu *pmu) diff --git a/include/linux/sched.h b/include/linux/sched.h index ffb6eb55cd13..4e03f1dcbe52 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1243,7 +1243,7 @@ struct task_struct { unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS - struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; + struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; #endif -- cgit From a351b1f444187312bb42479cb26e82f26fc481d2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 28 Oct 2022 10:00:26 +0200 Subject: acl: make vfs_posix_acl_to_xattr() static After reworking posix acls this helper isn't used anywhere outside the core posix acl paths. Make it static. Signed-off-by: Christian Brauner (Microsoft) --- include/linux/posix_acl_xattr.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 8daac3c5b583..54cd7a14330d 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -35,9 +35,6 @@ posix_acl_xattr_count(size_t size) #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, size_t size); -ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, - struct inode *inode, const struct posix_acl *acl, - void *buffer, size_t size); #else static inline struct posix_acl * posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, @@ -45,13 +42,6 @@ posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, { return ERR_PTR(-EOPNOTSUPP); } -static inline ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, - struct inode *inode, - const struct posix_acl *acl, - void *buffer, size_t size) -{ - return 0; -} #endif int posix_acl_to_xattr(struct user_namespace *user_ns, -- cgit From 29c1c44646aec5d5134f2365259a84becc1ee7d3 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:14 +0000 Subject: tcp: add u32 counter in tcp_sock and an SNMP counter for PLB A u32 counter is added to tcp_sock for counting the number of PLB triggered rehashes for a TCP connection. An SNMP counter is also added to count overall PLB triggered rehash events for a host. These counters are hooked up to PLB implementation for DCTCP. TCP_NLA_REHASH is added to SCM_TIMESTAMPING_OPT_STATS that reports the rehash attempts triggered due to PLB or timeouts. This gives a historical view of sustained congestion or timeouts experienced by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 41b1da621a45..ca7f05a130d2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -423,6 +423,7 @@ struct tcp_sock { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; + u32 plb_rehash; /* PLB-triggered rehash attempts */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ -- cgit From 6ab428604f724cf217a47b7d3f3353aab815b40e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Oct 2022 10:45:44 -1000 Subject: cgroup: Implement DEBUG_CGROUP_REF It's really difficult to debug when cgroup or css refs leak. Let's add a debug option to force the refcnt function to not be inlined so that they can be kprobed for debugging. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 97 ++++++------------------------------------- include/linux/cgroup_refcnt.h | 90 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 85 deletions(-) create mode 100644 include/linux/cgroup_refcnt.h (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a88de5bdeaa9..5c9c07a44706 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -309,71 +309,23 @@ void css_task_iter_end(struct css_task_iter *it); * Inline functions. */ +#ifdef CONFIG_DEBUG_CGROUP_REF +void css_get(struct cgroup_subsys_state *css); +void css_get_many(struct cgroup_subsys_state *css, unsigned int n); +bool css_tryget(struct cgroup_subsys_state *css); +bool css_tryget_online(struct cgroup_subsys_state *css); +void css_put(struct cgroup_subsys_state *css); +void css_put_many(struct cgroup_subsys_state *css, unsigned int n); +#else +#define CGROUP_REF_FN_ATTRS static inline +#include +#endif + static inline u64 cgroup_id(const struct cgroup *cgrp) { return cgrp->kn->id; } -/** - * css_get - obtain a reference on the specified css - * @css: target css - * - * The caller must already have a reference. - */ -static inline void css_get(struct cgroup_subsys_state *css) -{ - if (!(css->flags & CSS_NO_REF)) - percpu_ref_get(&css->refcnt); -} - -/** - * css_get_many - obtain references on the specified css - * @css: target css - * @n: number of references to get - * - * The caller must already have a reference. - */ -static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n) -{ - if (!(css->flags & CSS_NO_REF)) - percpu_ref_get_many(&css->refcnt, n); -} - -/** - * css_tryget - try to obtain a reference on the specified css - * @css: target css - * - * Obtain a reference on @css unless it already has reached zero and is - * being released. This function doesn't care whether @css is on or - * offline. The caller naturally needs to ensure that @css is accessible - * but doesn't have to be holding a reference on it - IOW, RCU protected - * access is good enough for this function. Returns %true if a reference - * count was successfully obtained; %false otherwise. - */ -static inline bool css_tryget(struct cgroup_subsys_state *css) -{ - if (!(css->flags & CSS_NO_REF)) - return percpu_ref_tryget(&css->refcnt); - return true; -} - -/** - * css_tryget_online - try to obtain a reference on the specified css if online - * @css: target css - * - * Obtain a reference on @css if it's online. The caller naturally needs - * to ensure that @css is accessible but doesn't have to be holding a - * reference on it - IOW, RCU protected access is good enough for this - * function. Returns %true if a reference count was successfully obtained; - * %false otherwise. - */ -static inline bool css_tryget_online(struct cgroup_subsys_state *css) -{ - if (!(css->flags & CSS_NO_REF)) - return percpu_ref_tryget_live(&css->refcnt); - return true; -} - /** * css_is_dying - test whether the specified css is dying * @css: target css @@ -394,31 +346,6 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt); } -/** - * css_put - put a css reference - * @css: target css - * - * Put a reference obtained via css_get() and css_tryget_online(). - */ -static inline void css_put(struct cgroup_subsys_state *css) -{ - if (!(css->flags & CSS_NO_REF)) - percpu_ref_put(&css->refcnt); -} - -/** - * css_put_many - put css references - * @css: target css - * @n: number of references to put - * - * Put references obtained via css_get() and css_tryget_online(). - */ -static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) -{ - if (!(css->flags & CSS_NO_REF)) - percpu_ref_put_many(&css->refcnt, n); -} - static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); diff --git a/include/linux/cgroup_refcnt.h b/include/linux/cgroup_refcnt.h new file mode 100644 index 000000000000..1aa89295dac0 --- /dev/null +++ b/include/linux/cgroup_refcnt.h @@ -0,0 +1,90 @@ +/** + * css_get - obtain a reference on the specified css + * @css: target css + * + * The caller must already have a reference. + */ +CGROUP_REF_FN_ATTRS +void css_get(struct cgroup_subsys_state *css) +{ + if (!(css->flags & CSS_NO_REF)) + percpu_ref_get(&css->refcnt); +} + +/** + * css_get_many - obtain references on the specified css + * @css: target css + * @n: number of references to get + * + * The caller must already have a reference. + */ +CGROUP_REF_FN_ATTRS +void css_get_many(struct cgroup_subsys_state *css, unsigned int n) +{ + if (!(css->flags & CSS_NO_REF)) + percpu_ref_get_many(&css->refcnt, n); +} + +/** + * css_tryget - try to obtain a reference on the specified css + * @css: target css + * + * Obtain a reference on @css unless it already has reached zero and is + * being released. This function doesn't care whether @css is on or + * offline. The caller naturally needs to ensure that @css is accessible + * but doesn't have to be holding a reference on it - IOW, RCU protected + * access is good enough for this function. Returns %true if a reference + * count was successfully obtained; %false otherwise. + */ +CGROUP_REF_FN_ATTRS +bool css_tryget(struct cgroup_subsys_state *css) +{ + if (!(css->flags & CSS_NO_REF)) + return percpu_ref_tryget(&css->refcnt); + return true; +} + +/** + * css_tryget_online - try to obtain a reference on the specified css if online + * @css: target css + * + * Obtain a reference on @css if it's online. The caller naturally needs + * to ensure that @css is accessible but doesn't have to be holding a + * reference on it - IOW, RCU protected access is good enough for this + * function. Returns %true if a reference count was successfully obtained; + * %false otherwise. + */ +CGROUP_REF_FN_ATTRS +bool css_tryget_online(struct cgroup_subsys_state *css) +{ + if (!(css->flags & CSS_NO_REF)) + return percpu_ref_tryget_live(&css->refcnt); + return true; +} + +/** + * css_put - put a css reference + * @css: target css + * + * Put a reference obtained via css_get() and css_tryget_online(). + */ +CGROUP_REF_FN_ATTRS +void css_put(struct cgroup_subsys_state *css) +{ + if (!(css->flags & CSS_NO_REF)) + percpu_ref_put(&css->refcnt); +} + +/** + * css_put_many - put css references + * @css: target css + * @n: number of references to put + * + * Put references obtained via css_get() and css_tryget_online(). + */ +CGROUP_REF_FN_ATTRS +void css_put_many(struct cgroup_subsys_state *css, unsigned int n) +{ + if (!(css->flags & CSS_NO_REF)) + percpu_ref_put_many(&css->refcnt, n); +} -- cgit From 03699f271de1f4df6369cd379506539cd7d590d3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Sep 2022 14:33:44 -0700 Subject: string: Rewrite and add more kern-doc for the str*() functions While there were varying degrees of kern-doc for various str*()-family functions, many needed updating and clarification, or to just be entirely written. Update (and relocate) existing kern-doc and add missing functions, sadly shaking my head at how many times I have written "Do not use this function". Include the results in the core kernel API doc. Cc: Bagas Sanjaya Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Andrew Morton Cc: linux-hardening@vger.kernel.org Tested-by: Akira Yokosawa Link: https://lore.kernel.org/lkml/9b0cf584-01b3-3013-b800-1ef59fe82476@gmail.com Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 133 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 123 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 0f00a551939a..e5b39b1cc2fc 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -106,13 +106,13 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) * Instead, please choose an alternative, so that the expectation * of @p's contents is unambiguous: * - * +--------------------+-----------------+------------+ - * | @p needs to be: | padded to @size | not padded | - * +====================+=================+============+ - * | NUL-terminated | strscpy_pad() | strscpy() | - * +--------------------+-----------------+------------+ - * | not NUL-terminated | strtomem_pad() | strtomem() | - * +--------------------+-----------------+------------+ + * +--------------------+--------------------+------------+ + * | **p** needs to be: | padded to **size** | not padded | + * +====================+====================+============+ + * | NUL-terminated | strscpy_pad() | strscpy() | + * +--------------------+--------------------+------------+ + * | not NUL-terminated | strtomem_pad() | strtomem() | + * +--------------------+--------------------+------------+ * * Note strscpy*()'s differing return values for detecting truncation, * and strtomem*()'s expectation that the destination is marked with @@ -131,6 +131,21 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size) return __underlying_strncpy(p, q, size); } +/** + * strcat - Append a string to an existing string + * + * @p: pointer to NUL-terminated string to append to + * @q: pointer to NUL-terminated source string to append from + * + * Do not use this function. While FORTIFY_SOURCE tries to avoid + * read and write overflows, this is only possible when the + * destination buffer size is known to the compiler. Prefer + * building the string with formatting, via scnprintf() or similar. + * At the very least, use strncat(). + * + * Returns @p. + * + */ __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { @@ -144,6 +159,16 @@ char *strcat(char * const POS p, const char *q) } extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); +/** + * strnlen - Return bounded count of characters in a NUL-terminated string + * + * @p: pointer to NUL-terminated string to count. + * @maxlen: maximum number of characters to count. + * + * Returns number of characters in @p (NOT including the final NUL), or + * @maxlen, if no NUL has been found up to there. + * + */ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen) { size_t p_size = __member_size(p); @@ -169,6 +194,19 @@ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size * possible for strlen() to be used on compile-time strings for use in * static initializers (i.e. as a constant expression). */ +/** + * strlen - Return count of characters in a NUL-terminated string + * + * @p: pointer to NUL-terminated string to count. + * + * Do not use this function unless the string length is known at + * compile-time. When @p is unterminated, this function may crash + * or return unexpected counts that could lead to memory content + * exposures. Prefer strnlen(). + * + * Returns number of characters in @p (NOT including the final NUL). + * + */ #define strlen(p) \ __builtin_choose_expr(__is_constexpr(__builtin_strlen(p)), \ __builtin_strlen(p), __fortify_strlen(p)) @@ -187,8 +225,26 @@ __kernel_size_t __fortify_strlen(const char * const POS p) return ret; } -/* defined after fortified strlen to reuse it */ +/* Defined after fortified strlen() to reuse it. */ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); +/** + * strlcpy - Copy a string into another string buffer + * + * @p: pointer to destination of copy + * @q: pointer to NUL-terminated source string to copy + * @size: maximum number of bytes to write at @p + * + * If strlen(@q) >= @size, the copy of @q will be truncated at + * @size - 1 bytes. @p will always be NUL-terminated. + * + * Do not use this function. While FORTIFY_SOURCE tries to avoid + * over-reads when calculating strlen(@q), it is still possible. + * Prefer strscpy(), though note its different return values for + * detecting truncation. + * + * Returns total number of bytes written to @p, including terminating NUL. + * + */ __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size) { size_t p_size = __member_size(p); @@ -214,8 +270,32 @@ __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, si return q_len; } -/* defined after fortified strnlen to reuse it */ +/* Defined after fortified strnlen() to reuse it. */ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); +/** + * strscpy - Copy a C-string into a sized buffer + * + * @p: Where to copy the string to + * @q: Where to copy the string from + * @size: Size of destination buffer + * + * Copy the source string @p, or as much of it as fits, into the destination + * @q buffer. The behavior is undefined if the string buffers overlap. The + * destination @p buffer is always NUL terminated, unless it's zero-sized. + * + * Preferred to strlcpy() since the API doesn't require reading memory + * from the source @q string beyond the specified @size bytes, and since + * the return value is easier to error-check than strlcpy()'s. + * In addition, the implementation is robust to the string changing out + * from underneath it, unlike the current strlcpy() implementation. + * + * Preferred to strncpy() since it always returns a valid string, and + * doesn't unnecessarily force the tail of the destination buffer to be + * zero padded. If padding is desired please use strscpy_pad(). + * + * Returns the number of characters copied in @p (not including the + * trailing %NUL) or -E2BIG if @size is 0 or the copy of @q was truncated. + */ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, size_t size) { size_t len; @@ -261,7 +341,26 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s return __real_strscpy(p, q, len); } -/* defined after fortified strlen and strnlen to reuse them */ +/** + * strncat - Append a string to an existing string + * + * @p: pointer to NUL-terminated string to append to + * @q: pointer to source string to append from + * @count: Maximum bytes to read from @q + * + * Appends at most @count bytes from @q (stopping at the first + * NUL byte) after the NUL-terminated string at @p. @p will be + * NUL-terminated. + * + * Do not use this function. While FORTIFY_SOURCE tries to avoid + * read and write overflows, this is only possible when the sizes + * of @p and @q are known to the compiler. Prefer building the + * string with formatting, via scnprintf() or similar. + * + * Returns @p. + * + */ +/* Defined after fortified strlen() and strnlen() to reuse them. */ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3) char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count) { @@ -572,6 +671,20 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp return __real_kmemdup(p, size, gfp); } +/** + * strcpy - Copy a string into another string buffer + * + * @p: pointer to destination of copy + * @q: pointer to NUL-terminated source string to copy + * + * Do not use this function. While FORTIFY_SOURCE tries to avoid + * overflows, this is only possible when the sizes of @q and @p are + * known to the compiler. Prefer strscpy(), though note its different + * return values for detecting truncation. + * + * Returns @p. + * + */ /* Defined after fortified strlen to reuse it. */ __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2) char *strcpy(char * const POS p, const char * const POS q) -- cgit From 9c5a170677c3c8facc83e931a57f4c99c0511ae0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:10:37 +0100 Subject: net: phylink: add phylink_get_link_timer_ns() helper Add a helper to convert the PHY interface mode to the required link timer setting as stated by the appropriate standard. Inappropriate interface modes return an error. Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 63800bf4a7ac..1df3e5e316e8 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -616,6 +616,30 @@ int phylink_speed_up(struct phylink *pl); void phylink_set_port_modes(unsigned long *bits); +/** + * phylink_get_link_timer_ns - return the PCS link timer value + * @interface: link &typedef phy_interface_t mode + * + * Return the PCS link timer setting in nanoseconds for the PHY @interface + * mode, or -EINVAL if not appropriate. + */ +static inline int phylink_get_link_timer_ns(phy_interface_t interface) +{ + switch (interface) { + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + case PHY_INTERFACE_MODE_USXGMII: + return 1600000; + + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_2500BASEX: + return 10000000; + + default: + return -EINVAL; + } +} + void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, u16 bmsr, u16 lpa); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, -- cgit From 17dd361119e5bc4cfb85aed660674a846b613817 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:21:16 +0100 Subject: net: sfp: convert register indexes from hex to decimal The register indexes in the standards are in decimal rather than hex, so lets specify them in decimal in the header file so we can easily cross-reference without converting between hex and decimal. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 180 ++++++++++++++++++++++++++-------------------------- 1 file changed, 90 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 01ae9f1dd2ad..4e2db155664d 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -332,37 +332,37 @@ enum { /* SFP EEPROM registers */ enum { - SFP_PHYS_ID = 0x00, - SFP_PHYS_EXT_ID = 0x01, - SFP_CONNECTOR = 0x02, - SFP_COMPLIANCE = 0x03, - SFP_ENCODING = 0x0b, - SFP_BR_NOMINAL = 0x0c, - SFP_RATE_ID = 0x0d, - SFP_LINK_LEN_SM_KM = 0x0e, - SFP_LINK_LEN_SM_100M = 0x0f, - SFP_LINK_LEN_50UM_OM2_10M = 0x10, - SFP_LINK_LEN_62_5UM_OM1_10M = 0x11, - SFP_LINK_LEN_COPPER_1M = 0x12, - SFP_LINK_LEN_50UM_OM4_10M = 0x12, - SFP_LINK_LEN_50UM_OM3_10M = 0x13, - SFP_VENDOR_NAME = 0x14, - SFP_VENDOR_OUI = 0x25, - SFP_VENDOR_PN = 0x28, - SFP_VENDOR_REV = 0x38, - SFP_OPTICAL_WAVELENGTH_MSB = 0x3c, - SFP_OPTICAL_WAVELENGTH_LSB = 0x3d, - SFP_CABLE_SPEC = 0x3c, - SFP_CC_BASE = 0x3f, - SFP_OPTIONS = 0x40, /* 2 bytes, MSB, LSB */ - SFP_BR_MAX = 0x42, - SFP_BR_MIN = 0x43, - SFP_VENDOR_SN = 0x44, - SFP_DATECODE = 0x54, - SFP_DIAGMON = 0x5c, - SFP_ENHOPTS = 0x5d, - SFP_SFF8472_COMPLIANCE = 0x5e, - SFP_CC_EXT = 0x5f, + SFP_PHYS_ID = 0, + SFP_PHYS_EXT_ID = 1, + SFP_CONNECTOR = 2, + SFP_COMPLIANCE = 3, + SFP_ENCODING = 11, + SFP_BR_NOMINAL = 12, + SFP_RATE_ID = 13, + SFP_LINK_LEN_SM_KM = 14, + SFP_LINK_LEN_SM_100M = 15, + SFP_LINK_LEN_50UM_OM2_10M = 16, + SFP_LINK_LEN_62_5UM_OM1_10M = 17, + SFP_LINK_LEN_COPPER_1M = 18, + SFP_LINK_LEN_50UM_OM4_10M = 18, + SFP_LINK_LEN_50UM_OM3_10M = 19, + SFP_VENDOR_NAME = 20, + SFP_VENDOR_OUI = 37, + SFP_VENDOR_PN = 40, + SFP_VENDOR_REV = 56, + SFP_OPTICAL_WAVELENGTH_MSB = 60, + SFP_OPTICAL_WAVELENGTH_LSB = 61, + SFP_CABLE_SPEC = 60, + SFP_CC_BASE = 63, + SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ + SFP_BR_MAX = 66, + SFP_BR_MIN = 67, + SFP_VENDOR_SN = 68, + SFP_DATECODE = 84, + SFP_DIAGMON = 92, + SFP_ENHOPTS = 93, + SFP_SFF8472_COMPLIANCE = 94, + SFP_CC_EXT = 95, SFP_PHYS_EXT_ID_SFP = 0x04, SFP_OPTIONS_HIGH_POWER_LEVEL = BIT(13), @@ -404,63 +404,63 @@ enum { /* SFP Diagnostics */ enum { /* Alarm and warnings stored MSB at lower address then LSB */ - SFP_TEMP_HIGH_ALARM = 0x00, - SFP_TEMP_LOW_ALARM = 0x02, - SFP_TEMP_HIGH_WARN = 0x04, - SFP_TEMP_LOW_WARN = 0x06, - SFP_VOLT_HIGH_ALARM = 0x08, - SFP_VOLT_LOW_ALARM = 0x0a, - SFP_VOLT_HIGH_WARN = 0x0c, - SFP_VOLT_LOW_WARN = 0x0e, - SFP_BIAS_HIGH_ALARM = 0x10, - SFP_BIAS_LOW_ALARM = 0x12, - SFP_BIAS_HIGH_WARN = 0x14, - SFP_BIAS_LOW_WARN = 0x16, - SFP_TXPWR_HIGH_ALARM = 0x18, - SFP_TXPWR_LOW_ALARM = 0x1a, - SFP_TXPWR_HIGH_WARN = 0x1c, - SFP_TXPWR_LOW_WARN = 0x1e, - SFP_RXPWR_HIGH_ALARM = 0x20, - SFP_RXPWR_LOW_ALARM = 0x22, - SFP_RXPWR_HIGH_WARN = 0x24, - SFP_RXPWR_LOW_WARN = 0x26, - SFP_LASER_TEMP_HIGH_ALARM = 0x28, - SFP_LASER_TEMP_LOW_ALARM = 0x2a, - SFP_LASER_TEMP_HIGH_WARN = 0x2c, - SFP_LASER_TEMP_LOW_WARN = 0x2e, - SFP_TEC_CUR_HIGH_ALARM = 0x30, - SFP_TEC_CUR_LOW_ALARM = 0x32, - SFP_TEC_CUR_HIGH_WARN = 0x34, - SFP_TEC_CUR_LOW_WARN = 0x36, - SFP_CAL_RXPWR4 = 0x38, - SFP_CAL_RXPWR3 = 0x3c, - SFP_CAL_RXPWR2 = 0x40, - SFP_CAL_RXPWR1 = 0x44, - SFP_CAL_RXPWR0 = 0x48, - SFP_CAL_TXI_SLOPE = 0x4c, - SFP_CAL_TXI_OFFSET = 0x4e, - SFP_CAL_TXPWR_SLOPE = 0x50, - SFP_CAL_TXPWR_OFFSET = 0x52, - SFP_CAL_T_SLOPE = 0x54, - SFP_CAL_T_OFFSET = 0x56, - SFP_CAL_V_SLOPE = 0x58, - SFP_CAL_V_OFFSET = 0x5a, - SFP_CHKSUM = 0x5f, - - SFP_TEMP = 0x60, - SFP_VCC = 0x62, - SFP_TX_BIAS = 0x64, - SFP_TX_POWER = 0x66, - SFP_RX_POWER = 0x68, - SFP_LASER_TEMP = 0x6a, - SFP_TEC_CUR = 0x6c, - - SFP_STATUS = 0x6e, + SFP_TEMP_HIGH_ALARM = 0, + SFP_TEMP_LOW_ALARM = 2, + SFP_TEMP_HIGH_WARN = 4, + SFP_TEMP_LOW_WARN = 6, + SFP_VOLT_HIGH_ALARM = 8, + SFP_VOLT_LOW_ALARM = 10, + SFP_VOLT_HIGH_WARN = 12, + SFP_VOLT_LOW_WARN = 14, + SFP_BIAS_HIGH_ALARM = 16, + SFP_BIAS_LOW_ALARM = 18, + SFP_BIAS_HIGH_WARN = 20, + SFP_BIAS_LOW_WARN = 22, + SFP_TXPWR_HIGH_ALARM = 24, + SFP_TXPWR_LOW_ALARM = 26, + SFP_TXPWR_HIGH_WARN = 28, + SFP_TXPWR_LOW_WARN = 30, + SFP_RXPWR_HIGH_ALARM = 32, + SFP_RXPWR_LOW_ALARM = 34, + SFP_RXPWR_HIGH_WARN = 36, + SFP_RXPWR_LOW_WARN = 38, + SFP_LASER_TEMP_HIGH_ALARM = 40, + SFP_LASER_TEMP_LOW_ALARM = 42, + SFP_LASER_TEMP_HIGH_WARN = 44, + SFP_LASER_TEMP_LOW_WARN = 46, + SFP_TEC_CUR_HIGH_ALARM = 48, + SFP_TEC_CUR_LOW_ALARM = 50, + SFP_TEC_CUR_HIGH_WARN = 52, + SFP_TEC_CUR_LOW_WARN = 54, + SFP_CAL_RXPWR4 = 56, + SFP_CAL_RXPWR3 = 60, + SFP_CAL_RXPWR2 = 64, + SFP_CAL_RXPWR1 = 68, + SFP_CAL_RXPWR0 = 72, + SFP_CAL_TXI_SLOPE = 76, + SFP_CAL_TXI_OFFSET = 78, + SFP_CAL_TXPWR_SLOPE = 80, + SFP_CAL_TXPWR_OFFSET = 82, + SFP_CAL_T_SLOPE = 84, + SFP_CAL_T_OFFSET = 86, + SFP_CAL_V_SLOPE = 88, + SFP_CAL_V_OFFSET = 90, + SFP_CHKSUM = 95, + + SFP_TEMP = 96, + SFP_VCC = 98, + SFP_TX_BIAS = 100, + SFP_TX_POWER = 102, + SFP_RX_POWER = 104, + SFP_LASER_TEMP = 106, + SFP_TEC_CUR = 108, + + SFP_STATUS = 110, SFP_STATUS_TX_DISABLE = BIT(7), SFP_STATUS_TX_DISABLE_FORCE = BIT(6), SFP_STATUS_TX_FAULT = BIT(2), SFP_STATUS_RX_LOS = BIT(1), - SFP_ALARM0 = 0x70, + SFP_ALARM0 = 112, SFP_ALARM0_TEMP_HIGH = BIT(7), SFP_ALARM0_TEMP_LOW = BIT(6), SFP_ALARM0_VCC_HIGH = BIT(5), @@ -470,11 +470,11 @@ enum { SFP_ALARM0_TXPWR_HIGH = BIT(1), SFP_ALARM0_TXPWR_LOW = BIT(0), - SFP_ALARM1 = 0x71, + SFP_ALARM1 = 113, SFP_ALARM1_RXPWR_HIGH = BIT(7), SFP_ALARM1_RXPWR_LOW = BIT(6), - SFP_WARN0 = 0x74, + SFP_WARN0 = 116, SFP_WARN0_TEMP_HIGH = BIT(7), SFP_WARN0_TEMP_LOW = BIT(6), SFP_WARN0_VCC_HIGH = BIT(5), @@ -484,15 +484,15 @@ enum { SFP_WARN0_TXPWR_HIGH = BIT(1), SFP_WARN0_TXPWR_LOW = BIT(0), - SFP_WARN1 = 0x75, + SFP_WARN1 = 117, SFP_WARN1_RXPWR_HIGH = BIT(7), SFP_WARN1_RXPWR_LOW = BIT(6), - SFP_EXT_STATUS = 0x76, + SFP_EXT_STATUS = 118, SFP_EXT_STATUS_PWRLVL_SELECT = BIT(0), - SFP_VSL = 0x78, - SFP_PAGE = 0x7f, + SFP_VSL = 120, + SFP_PAGE = 127, }; struct fwnode_handle; -- cgit From d83845d224a059165902ecd0e1203015c5713a78 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:21:21 +0100 Subject: net: sfp: move field definitions along side register index Just as we do for the A2h enum, arrange the A0h enum to have the field definitions next to their corresponding register index. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 4e2db155664d..52b98f9666a2 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -333,7 +333,10 @@ enum { /* SFP EEPROM registers */ enum { SFP_PHYS_ID = 0, + SFP_PHYS_EXT_ID = 1, + SFP_PHYS_EXT_ID_SFP = 0x04, + SFP_CONNECTOR = 2, SFP_COMPLIANCE = 3, SFP_ENCODING = 11, @@ -354,17 +357,8 @@ enum { SFP_OPTICAL_WAVELENGTH_LSB = 61, SFP_CABLE_SPEC = 60, SFP_CC_BASE = 63, - SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ - SFP_BR_MAX = 66, - SFP_BR_MIN = 67, - SFP_VENDOR_SN = 68, - SFP_DATECODE = 84, - SFP_DIAGMON = 92, - SFP_ENHOPTS = 93, - SFP_SFF8472_COMPLIANCE = 94, - SFP_CC_EXT = 95, - SFP_PHYS_EXT_ID_SFP = 0x04, + SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ SFP_OPTIONS_HIGH_POWER_LEVEL = BIT(13), SFP_OPTIONS_PAGING_A2 = BIT(12), SFP_OPTIONS_RETIMER = BIT(11), @@ -378,11 +372,20 @@ enum { SFP_OPTIONS_TX_FAULT = BIT(3), SFP_OPTIONS_LOS_INVERTED = BIT(2), SFP_OPTIONS_LOS_NORMAL = BIT(1), + + SFP_BR_MAX = 66, + SFP_BR_MIN = 67, + SFP_VENDOR_SN = 68, + SFP_DATECODE = 84, + + SFP_DIAGMON = 92, SFP_DIAGMON_DDM = BIT(6), SFP_DIAGMON_INT_CAL = BIT(5), SFP_DIAGMON_EXT_CAL = BIT(4), SFP_DIAGMON_RXPWR_AVG = BIT(3), SFP_DIAGMON_ADDRMODE = BIT(2), + + SFP_ENHOPTS = 93, SFP_ENHOPTS_ALARMWARN = BIT(7), SFP_ENHOPTS_SOFT_TX_DISABLE = BIT(6), SFP_ENHOPTS_SOFT_TX_FAULT = BIT(5), @@ -390,6 +393,8 @@ enum { SFP_ENHOPTS_SOFT_RATE_SELECT = BIT(3), SFP_ENHOPTS_APP_SELECT_SFF8079 = BIT(2), SFP_ENHOPTS_SOFT_RATE_SFF8431 = BIT(1), + + SFP_SFF8472_COMPLIANCE = 94, SFP_SFF8472_COMPLIANCE_NONE = 0x00, SFP_SFF8472_COMPLIANCE_REV9_3 = 0x01, SFP_SFF8472_COMPLIANCE_REV9_5 = 0x02, @@ -399,6 +404,8 @@ enum { SFP_SFF8472_COMPLIANCE_REV11_3 = 0x06, SFP_SFF8472_COMPLIANCE_REV11_4 = 0x07, SFP_SFF8472_COMPLIANCE_REV12_0 = 0x08, + + SFP_CC_EXT = 95, }; /* SFP Diagnostics */ -- cgit From 2fcd7bbae90a6d844da8660a9d27079281dfbba2 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 14 Oct 2022 19:05:51 +0800 Subject: sched/psi: Fix avgs_work re-arm in psi_avgs_work() Pavan reported a problem that PSI avgs_work idle shutoff is not working at all. Because PSI_NONIDLE condition would be observed in psi_avgs_work()->collect_percpu_times()->get_recent_times() even if only the kworker running avgs_work on the CPU. Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off") avoided the ping-pong wake problem when the worker sleep, psi_avgs_work() still will always re-arm the avgs_work, so shutoff is not working. This patch changes to use PSI_STATE_RESCHEDULE to flag whether to re-arm avgs_work in get_recent_times(). For the current CPU, we re-arm avgs_work only when (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs we can just check PSI_NONIDLE delta. The new flag is only used in psi_avgs_work(), so we check in get_recent_times() that current_work() is avgs_work. One potential problem is that the brief period of non-idle time incurred between the aggregation run and the kworker's dequeue will be stranded in the per-cpu buckets until avgs_work run next time. The buckets can hold 4s worth of time, and future activity will wake the avgs_work with a 2s delay, giving us 2s worth of data we can leave behind when shut off the avgs_work. If the kworker run other works after avgs_work shut off and doesn't have any scheduler activities for 2s, this maybe a problem. Reported-by: Pavan Kondeti Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Acked-by: Suren Baghdasaryan Tested-by: Chengming Zhou Link: https://lore.kernel.org/r/20221014110551.22695-1-zhouchengming@bytedance.com --- include/linux/psi_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 6e4372735068..325981833587 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -72,6 +72,9 @@ enum psi_states { /* Use one bit in the state mask to track TSK_ONCPU */ #define PSI_ONCPU (1 << NR_PSI_STATES) +/* Flag whether to re-arm avgs_work, see details in get_recent_times() */ +#define PSI_STATE_RESCHEDULE (1 << (NR_PSI_STATES + 1)) + enum psi_aggregators { PSI_AVGS = 0, PSI_POLL, -- cgit From 710ffe671e014d5ccbcff225130a178b088ef090 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 28 Oct 2022 12:45:41 -0700 Subject: sched/psi: Stop relying on timer_pending() for poll_work rescheduling Psi polling mechanism is trying to minimize the number of wakeups to run psi_poll_work and is currently relying on timer_pending() to detect when this work is already scheduled. This provides a window of opportunity for psi_group_change to schedule an immediate psi_poll_work after poll_timer_fn got called but before psi_poll_work could reschedule itself. Below is the depiction of this entire window: poll_timer_fn wake_up_interruptible(&group->poll_wait); psi_poll_worker wait_event_interruptible(group->poll_wait, ...) psi_poll_work psi_schedule_poll_work if (timer_pending(&group->poll_timer)) return; ... mod_timer(&group->poll_timer, jiffies + delay); Prior to 461daba06bdc we used to rely on poll_scheduled atomic which was reset and set back inside psi_poll_work and therefore this race window was much smaller. The larger window causes increased number of wakeups and our partners report visible power regression of ~10mA after applying 461daba06bdc. Bring back the poll_scheduled atomic and make this race window even narrower by resetting poll_scheduled only when we reach polling expiration time. This does not completely eliminate the possibility of extra wakeups caused by a race with psi_group_change however it will limit it to the worst case scenario of one extra wakeup per every tracking window (0.5s in the worst case). This patch also ensures correct ordering between clearing poll_scheduled flag and obtaining changed_states using memory barrier. Correct ordering between updating changed_states and setting poll_scheduled is ensured by atomic_xchg operation. By tracing the number of immediate rescheduling attempts performed by psi_group_change and the number of these attempts being blocked due to psi monitor being already active, we can assess the effects of this change: Before the patch: Run#1 Run#2 Run#3 Immediate reschedules attempted: 684365 1385156 1261240 Immediate reschedules blocked: 682846 1381654 1258682 Immediate reschedules (delta): 1519 3502 2558 Immediate reschedules (% of attempted): 0.22% 0.25% 0.20% After the patch: Run#1 Run#2 Run#3 Immediate reschedules attempted: 882244 770298 426218 Immediate reschedules blocked: 881996 769796 426074 Immediate reschedules (delta): 248 502 144 Immediate reschedules (% of attempted): 0.03% 0.07% 0.03% The number of non-blocked immediate reschedules dropped from 0.22-0.25% to 0.03-0.07%. The drop is attributed to the decrease in the race window size and the fact that we allow this race only when psi monitors reach polling window expiration time. Fixes: 461daba06bdc ("psi: eliminate kthread_worker from psi trigger scheduling mechanism") Reported-by: Kathleen Chang Reported-by: Wenju Xu Reported-by: Jonathan Chen Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Tested-by: SH Chen Link: https://lore.kernel.org/r/20221028194541.813985-1-surenb@google.com --- include/linux/psi_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 325981833587..1e0a0d7ace3a 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -180,6 +180,7 @@ struct psi_group { struct timer_list poll_timer; wait_queue_head_t poll_wait; atomic_t poll_wakeup; + atomic_t poll_scheduled; /* Protects data used by the monitor */ struct mutex trigger_lock; -- cgit From 52b33d87b9197c51e8ffdc61873739d90dd0a16f Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 26 Sep 2022 16:19:31 +0800 Subject: sched/psi: Use task->psi_flags to clear in CPU migration The commit d583d360a620 ("psi: Fix psi state corruption when schedule() races with cgroup move") fixed a race problem by making cgroup_move_task() use task->psi_flags instead of looking at the scheduler state. We can extend task->psi_flags usage to CPU migration, which should be a minor optimization for performance and code simplicity. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220926081931.45420-1-zhouchengming@bytedance.com --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ffb6eb55cd13..23de7fe86cc4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -888,9 +888,6 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; -#ifdef CONFIG_PSI - unsigned sched_psi_wake_requeue:1; -#endif /* Force alignment to the next boundary: */ unsigned :0; -- cgit From f3fb589aeb88c5bf7a7a804d36a8fa219b72e290 Mon Sep 17 00:00:00 2001 From: Juhee Kang Date: Fri, 28 Oct 2022 01:04:24 +0900 Subject: net: remove unused netdev_unregistering() Currently, use dev->reg_state == NETREG_UNREGISTERING to check the status which is NETREG_UNREGISTERING, rather than using netdev_unregistering. Also, A helper function which is netdev_unregistering on nedevice.h is no longer used. Thus, netdev_unregistering removes from netdevice.h. Signed-off-by: Juhee Kang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eddf8ee270e7..99e58b773266 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5101,11 +5101,6 @@ static inline const char *netdev_name(const struct net_device *dev) return dev->name; } -static inline bool netdev_unregistering(const struct net_device *dev) -{ - return dev->reg_state == NETREG_UNREGISTERING; -} - static inline const char *netdev_reg_state(const struct net_device *dev) { switch (dev->reg_state) { -- cgit From b9a61b97798ca6d0c856a217e61e2177bd8f6eae Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 28 Oct 2022 04:04:12 -0700 Subject: ptp: add missing documentation for parameters The ptp_find_pin_unlocked function and the ptp_system_timestamp structure didn't document their parameters and fields. Fix this. Signed-off-by: Jacob Keller Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 92b44161408e..ad4aaadc2f7a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -45,6 +45,8 @@ struct system_device_crosststamp; /** * struct ptp_system_timestamp - system time corresponding to a PHC timestamp + * @pre_ts: system timestamp before capturing PHC + * @post_ts: system timestamp after capturing PHC */ struct ptp_system_timestamp { struct timespec64 pre_ts; @@ -316,6 +318,11 @@ int ptp_find_pin(struct ptp_clock *ptp, * should most likely call ptp_find_pin() directly from their * ptp_clock_info::enable() method. * +* @ptp: The clock obtained from ptp_clock_register(). +* @func: One of the ptp_pin_function enumerated values. +* @chan: The particular functional channel to find. +* Return: Pin index in the range of zero to ptp_clock_caps.n_pins - 1, +* or -1 if the auxiliary function cannot be found. */ int ptp_find_pin_unlocked(struct ptp_clock *ptp, -- cgit From 1060707e380994e7c9b754b6eb74f25459b4a5b3 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 28 Oct 2022 04:04:13 -0700 Subject: ptp: introduce helpers to adjust by scaled parts per million Many drivers implement the .adjfreq or .adjfine PTP op function with the same basic logic: 1. Determine a base frequency value 2. Multiply this by the abs() of the requested adjustment, then divide by the appropriate divisor (1 billion, or 65,536 billion). 3. Add or subtract this difference from the base frequency to calculate a new adjustment. A few drivers need the difference and direction rather than the combined new increment value. I recently converted the Intel drivers to .adjfine and the scaled parts per million (65.536 parts per billion) logic. To avoid overflow with minimal loss of precision, mul_u64_u64_div_u64 was used. The basic logic used by all of these drivers is very similar, and leads to a lot of duplicate code to perform the same task. Rather than keep this duplicate code, introduce diff_by_scaled_ppm and adjust_by_scaled_ppm. These helper functions calculate the difference or adjustment necessary based on the scaled parts per million input. The diff_by_scaled_ppm function returns true if the difference should be subtracted, and false otherwise. Update the Intel drivers to use the new helper functions. Other vendor drivers will be converted to .adjfine and this helper function in the following changes. Signed-off-by: Jacob Keller Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index ad4aaadc2f7a..f4781c5766d6 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -248,6 +248,52 @@ static inline long scaled_ppm_to_ppb(long ppm) return (long)ppb; } +/** + * diff_by_scaled_ppm - Calculate difference using scaled ppm + * @base: the base increment value to adjust + * @scaled_ppm: scaled parts per million to adjust by + * @diff: on return, the absolute value of calculated diff + * + * Calculate the difference to adjust the base increment using scaled parts + * per million. + * + * Use mul_u64_u64_div_u64 to perform the difference calculation in avoid + * possible overflow. + * + * Returns: true if scaled_ppm is negative, false otherwise + */ +static inline bool diff_by_scaled_ppm(u64 base, long scaled_ppm, u64 *diff) +{ + bool negative = false; + + if (scaled_ppm < 0) { + negative = true; + scaled_ppm = -scaled_ppm; + } + + *diff = mul_u64_u64_div_u64(base, (u64)scaled_ppm, 1000000ULL << 16); + + return negative; +} + +/** + * adjust_by_scaled_ppm - Adjust a base increment by scaled parts per million + * @base: the base increment value to adjust + * @scaled_ppm: scaled parts per million frequency adjustment + * + * Helper function which calculates a new increment value based on the + * requested scaled parts per million adjustment. + */ +static inline u64 adjust_by_scaled_ppm(u64 base, long scaled_ppm) +{ + u64 diff; + + if (diff_by_scaled_ppm(base, scaled_ppm, &diff)) + return base - diff; + + return base + diff; +} + #if IS_ENABLED(CONFIG_PTP_1588_CLOCK) /** -- cgit From adff215830fcf3ef74f2f0d4dd5a47a6927d450b Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Sun, 30 Oct 2022 13:20:08 +0800 Subject: block: simplify blksize_bits() implementation Convert current looping-based implementation into bit operation, which can bring improvement for: 1) bitops is more efficient for its arch-level optimization. 2) Given that blksize_bits() is inline, _if_ @size is compile-time constant, it's possible that order_base_2() _may_ make output compile-time evaluated, depending on code context and compiler behavior. Signed-off-by: Dawei Li Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/TYCP286MB23238842958D7C083D6B67CECA349@TYCP286MB2323.JPNP286.PROD.OUTLOOK.COM Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 57ed49f20d2e..32137d85c9ad 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1349,12 +1349,7 @@ static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, /* assumes size > 256 */ static inline unsigned int blksize_bits(unsigned int size) { - unsigned int bits = 8; - do { - bits++; - size >>= 1; - } while (size > 256); - return bits; + return order_base_2(size >> SECTOR_SHIFT) + SECTOR_SHIFT; } static inline unsigned int block_size(struct block_device *bdev) -- cgit From 256c8aed2b420a7c57ed6469fbb0f8310f5aeec9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 26 Oct 2022 12:51:27 +0200 Subject: fs: introduce dedicated idmap type for mounts Last cycle we've already made the interaction with idmapped mounts more robust and type safe by introducing the vfs{g,u}id_t type. This cycle we concluded the conversion and removed the legacy helpers. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate filesystem and mount namespaces and what different roles they have to play. Especially for filesystem developers without much experience in this area this is an easy source for bugs. Instead of passing the plain namespace we introduce a dedicated type struct mnt_idmap and replace the pointer with a pointer to a struct mnt_idmap. There are no semantic or size changes for the mount struct caused by this. We then start converting all places aware of idmapped mounts to rely on struct mnt_idmap. Once the conversion is done all helpers down to the really low-level make_vfs{g,u}id() and from_vfs{g,u}id() will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two, removing and thus eliminating the possibility of any bugs. Fwiw, I fixed some issues in that area a while ago in ntfs3 and ksmbd in the past. Afterwards, only low-level code can ultimately use the associated namespace for any permission checks. Even most of the vfs can be ultimately completely oblivious about this and filesystems will never interact with it directly in any form in the future. A struct mnt_idmap currently encompasses a simple refcount and a pointer to the relevant namespace the mount is idmapped to. If a mount isn't idmapped then it will point to a static nop_mnt_idmap. If it is an idmapped mount it will point to a new struct mnt_idmap. As usual there are no allocations or anything happening for non-idmapped mounts. Everthing is carefully written to be a nop for non-idmapped mounts as has always been the case. If an idmapped mount or mount tree is created a new struct mnt_idmap is allocated and a reference taken on the relevant namespace. For each mount in a mount tree that gets idmapped or a mount that inherits the idmap when it is cloned the reference count on the associated struct mnt_idmap is bumped. Just a reminder that we only allow a mount to change it's idmapping a single time and only if it hasn't already been attached to the filesystems and has no active writers. The actual changes are fairly straightforward. This will have huge benefits for maintenance and security in the long run even if it causes some churn. I'm aware that there's some cost for all of you. And I'll commit to doing this work and make this as painless as I can. Note that this also makes it possible to extend struct mount_idmap in the future. For example, it would be possible to place the namespace pointer in an anonymous union together with an idmapping struct. This would allow us to expose an api to userspace that would let it specify idmappings directly instead of having to go through the detour of setting up namespaces at all. This just adds the infrastructure and doesn't do any conversions. Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- include/linux/fs.h | 10 +++++++--- include/linux/mnt_idmapping.h | 8 +++----- include/linux/mount.h | 9 ++++++--- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index cffc402c2451..be56c2192926 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2700,18 +2700,22 @@ static inline struct user_namespace *file_mnt_user_ns(struct file *file) return mnt_user_ns(file->f_path.mnt); } +static inline struct mnt_idmap *file_mnt_idmap(struct file *file) +{ + return mnt_idmap(file->f_path.mnt); +} + /** * is_idmapped_mnt - check whether a mount is mapped * @mnt: the mount to check * - * If @mnt has an idmapping attached different from the - * filesystem's idmapping then @mnt is mapped. + * If @mnt has an non @nop_mnt_idmap attached to it then @mnt is mapped. * * Return: true if mount is mapped, false if not. */ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) { - return mnt_user_ns(mnt) != mnt->mnt_sb->s_user_ns; + return mnt_idmap(mnt) != &nop_mnt_idmap; } extern long vfs_truncate(const struct path *, loff_t); diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index c8002294a72d..092c52aa6c2c 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -5,12 +5,10 @@ #include #include +struct mnt_idmap; struct user_namespace; -/* - * Carries the initial idmapping of 0:0:4294967295 which is an identity - * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is - * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. - */ + +extern struct mnt_idmap nop_mnt_idmap; extern struct user_namespace init_user_ns; typedef struct { diff --git a/include/linux/mount.h b/include/linux/mount.h index 55a4abaf6715..62475996fac6 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -16,6 +16,7 @@ struct super_block; struct dentry; struct user_namespace; +struct mnt_idmap; struct file_system_type; struct fs_context; struct file; @@ -70,13 +71,15 @@ struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; - struct user_namespace *mnt_userns; + struct mnt_idmap *mnt_idmap; } __randomize_layout; -static inline struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) +struct user_namespace *mnt_user_ns(const struct vfsmount *mnt); +struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap); +static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). */ - return smp_load_acquire(&mnt->mnt_userns); + return smp_load_acquire(&mnt->mnt_idmap); } extern int mnt_want_write(struct vfsmount *mnt); -- cgit From 79a7f41f7f5ac69fd22eaf1fb3e230bea95f3399 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 31 Oct 2022 07:12:13 -1000 Subject: cgroup: cgroup refcnt functions should be exported when CONFIG_DEBUG_CGROUP_REF 6ab428604f72 ("cgroup: Implement DEBUG_CGROUP_REF") added a config option which forces cgroup refcnt functions to be not inlined so that they can be kprobed for debugging. However, it forgot export them when the config is enabled breaking modules which make use of css reference counting. Fix it by adding CGROUP_REF_EXPORT() macro to cgroup_refcnt.h which is defined to EXPORT_SYMBOL_GPL when CONFIG_DEBUG_CGROUP_REF is set. Signed-off-by: Tejun Heo Fixes: 6ab428604f72 ("cgroup: Implement DEBUG_CGROUP_REF") --- include/linux/cgroup.h | 1 + include/linux/cgroup_refcnt.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5c9c07a44706..c8441090ca4c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -318,6 +318,7 @@ void css_put(struct cgroup_subsys_state *css); void css_put_many(struct cgroup_subsys_state *css, unsigned int n); #else #define CGROUP_REF_FN_ATTRS static inline +#define CGROUP_REF_EXPORT(fn) #include #endif diff --git a/include/linux/cgroup_refcnt.h b/include/linux/cgroup_refcnt.h index 1aa89295dac0..2eea0a69ecfc 100644 --- a/include/linux/cgroup_refcnt.h +++ b/include/linux/cgroup_refcnt.h @@ -10,6 +10,7 @@ void css_get(struct cgroup_subsys_state *css) if (!(css->flags & CSS_NO_REF)) percpu_ref_get(&css->refcnt); } +CGROUP_REF_EXPORT(css_get) /** * css_get_many - obtain references on the specified css @@ -24,6 +25,7 @@ void css_get_many(struct cgroup_subsys_state *css, unsigned int n) if (!(css->flags & CSS_NO_REF)) percpu_ref_get_many(&css->refcnt, n); } +CGROUP_REF_EXPORT(css_get_many) /** * css_tryget - try to obtain a reference on the specified css @@ -43,6 +45,7 @@ bool css_tryget(struct cgroup_subsys_state *css) return percpu_ref_tryget(&css->refcnt); return true; } +CGROUP_REF_EXPORT(css_tryget) /** * css_tryget_online - try to obtain a reference on the specified css if online @@ -61,6 +64,7 @@ bool css_tryget_online(struct cgroup_subsys_state *css) return percpu_ref_tryget_live(&css->refcnt); return true; } +CGROUP_REF_EXPORT(css_tryget_online) /** * css_put - put a css reference @@ -74,6 +78,7 @@ void css_put(struct cgroup_subsys_state *css) if (!(css->flags & CSS_NO_REF)) percpu_ref_put(&css->refcnt); } +CGROUP_REF_EXPORT(css_put) /** * css_put_many - put css references @@ -88,3 +93,4 @@ void css_put_many(struct cgroup_subsys_state *css, unsigned int n) if (!(css->flags & CSS_NO_REF)) percpu_ref_put_many(&css->refcnt, n); } +CGROUP_REF_EXPORT(css_put_many) -- cgit From 1d997f1013079c05b642c739901e3584a3ae558d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:21 -0400 Subject: rtnetlink: pass netlink message header and portid to rtnl_configure_link() This patch pass netlink message header and portid to rtnl_configure_link() All the functions in this call chain need to add the parameters so we can use them in the last call rtnl_notify(), and notify the userspace about the new link info if NLM_F_ECHO flag is set. - rtnl_configure_link() - __dev_notify_flags() - rtmsg_ifinfo() - rtmsg_ifinfo_event() - rtmsg_ifinfo_build_skb() - rtmsg_ifinfo_send() - rtnl_notify() Also move __dev_notify_flags() declaration to net/core/dev.h, as Jakub suggested. Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- include/linux/rtnetlink.h | 9 +++++---- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 99e58b773266..4b5052db978f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3855,8 +3855,6 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); -void __dev_notify_flags(struct net_device *, unsigned int old_flags, - unsigned int gchanges); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index ae2c6a3cec5d..92ad75549e9c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -12,21 +12,22 @@ extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, - u32 group, struct nlmsghdr *nlh, gfp_t flags); + u32 group, const struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, + u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex); + int new_ifindex, u32 portid, u32 seq); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, - gfp_t flags); + gfp_t flags, u32 portid, const struct nlmsghdr *nlh); /* RTNL is used as a global lock for all changes to network configuration */ -- cgit From 931ab63664f02b17d2213ef36b83e1e50190a0aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 11:28:14 +0200 Subject: x86/ibt: Implement FineIBT Implement an alternative CFI scheme that merges both the fine-grained nature of kCFI but also takes full advantage of the coarse grained hardware CFI as provided by IBT. To contrast: kCFI is a pure software CFI scheme and relies on being able to read text -- specifically the instruction *before* the target symbol, and does the hash validation *before* doing the call (otherwise control flow is compromised already). FineIBT is a software and hardware hybrid scheme; by ensuring every branch target starts with a hash validation it is possible to place the hash validation after the branch. This has several advantages: o the (hash) load is avoided; no memop; no RX requirement. o IBT WAIT-FOR-ENDBR state is a speculation stop; by placing the hash validation in the immediate instruction after the branch target there is a minimal speculation window and the whole is a viable defence against SpectreBHB. o Kees feels obliged to mention it is slightly more vulnerable when the attacker can write code. Obviously this patch relies on kCFI, but additionally it also relies on the padding from the call-depth-tracking patches. It uses this padding to place the hash-validation while the call-sites are re-written to modify the indirect target to be 16 bytes in front of the original target, thus hitting this new preamble. Notably, there is no hardware that needs call-depth-tracking (Skylake) and supports IBT (Tigerlake and onwards). Suggested-by: Joao Moreira (Intel) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20221027092842.634714496@infradead.org --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5296aea9b5b4..923a3d508047 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -984,7 +984,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func } #ifdef CONFIG_X86_64 -#ifdef CONFIG_CALL_THUNKS +#ifdef CONFIG_CALL_PADDING #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5+CONFIG_FUNCTION_PADDING_BYTES,CONFIG_FUNCTION_PADDING_BYTES))) #else #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) -- cgit From 9e4a617757273a86b560c1ece40c48e4940a3c79 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 29 Sep 2022 02:24:53 -0700 Subject: string: Add __realloc_size hint to kmemdup() Add __realloc_size() hint to kmemdup() so the compiler can reason about the length of the returned buffer. (These must not use __alloc_size, since those include __malloc which says the contents aren't defined[1]). [1] https://lore.kernel.org/linux-hardening/d199c2af-06af-8a50-a6a1-00eefa0b67b4@prevas.dk/ Cc: Rasmus Villemoes Cc: Guenter Roeck Cc: Andy Shevchenko Cc: Paolo Abeni Cc: Geert Uytterhoeven Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 3 ++- include/linux/string.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index e5b39b1cc2fc..49782f63f015 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -659,7 +659,8 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) return __real_memchr_inv(p, c, size); } -extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); +extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup) + __realloc_size(2); __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp) { size_t p_size = __struct_size(p); diff --git a/include/linux/string.h b/include/linux/string.h index cf7607b32102..db28802ab0a6 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -176,7 +176,7 @@ extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); -extern void *kmemdup(const void *src, size_t len, gfp_t gfp); +extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); -- cgit From 62e1cbfc5d795381a0f237ae7ee229a92d51cf9e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 2 Oct 2022 09:17:03 -0700 Subject: fortify: Short-circuit known-safe calls to strscpy() Replacing compile-time safe calls of strcpy()-related functions with strscpy() was always calling the full strscpy() logic when a builtin would be better. For example: char buf[16]; strcpy(buf, "yes"); would reduce to __builtin_memcpy(buf, "yes", 4), but not if it was: strscpy(buf, yes, sizeof(buf)); Fix this by checking if all sizes are known at compile-time. Cc: linux-hardening@vger.kernel.org Tested-by: Nathan Chancellor Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 49782f63f015..32a66d4b30ca 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -314,6 +314,16 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s if (__compiletime_lessthan(p_size, size)) __write_overflow(); + /* Short-circuit for compile-time known-safe lengths. */ + if (__compiletime_lessthan(p_size, SIZE_MAX)) { + len = __compiletime_strlen(q); + + if (len < SIZE_MAX && __compiletime_lessthan(len, size)) { + __underlying_memcpy(p, q, len + 1); + return len; + } + } + /* * This call protects from read overflow, because len will default to q * length if it smaller than size. -- cgit From e9a40e1585d792751d3a122392695e5a53032809 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 25 Oct 2022 16:05:18 -0700 Subject: fortify: Do not cast to "unsigned char" Do not cast to "unsigned char", as this needlessly creates type problems when attempting builds without -Wno-pointer-sign[1]. The intent of the cast is to drop possible "const" types. [1] https://lore.kernel.org/lkml/CAHk-=wgz3Uba8w7kdXhsqR1qvfemYL+OFQdefJnkeqXG8qZ_pA@mail.gmail.com/ Suggested-by: Linus Torvalds Fixes: 3009f891bb9f ("fortify: Allow strlen() and strnlen() to pass compile-time known lengths") Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 32a66d4b30ca..aa31f54f8b57 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -18,7 +18,7 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning(" #define __compiletime_strlen(p) \ ({ \ - unsigned char *__p = (unsigned char *)(p); \ + char *__p = (char *)(p); \ size_t __ret = SIZE_MAX; \ size_t __p_size = __member_size(p); \ if (__p_size != SIZE_MAX && \ -- cgit From 00208852d351ca6e4a8b9ff0c5376fa3a8ed8eaa Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Oct 2022 16:01:22 -0700 Subject: iommu: Add return value rules to attach_dev op and APIs Cases like VFIO wish to attach a device to an existing domain that was not allocated specifically from the device. This raises a condition where the IOMMU driver can fail the domain attach because the domain and device are incompatible with each other. This is a soft failure that can be resolved by using a different domain. Provide a dedicated errno EINVAL from the IOMMU driver during attach that the reason why the attach failed is because of domain incompatibility. VFIO can use this to know that the attach is a soft failure and it should continue searching. Otherwise, the attach will be a hard failure and VFIO will return the code to userspace. Update kdocs to add rules of return value to the attach_dev op and APIs. Link: https://lore.kernel.org/r/bd56d93c18621104a0fa1b0de31e9b760b81b769.1666042872.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3c9da1f8979e..857898d102b3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -266,6 +266,18 @@ struct iommu_ops { /** * struct iommu_domain_ops - domain specific operations * @attach_dev: attach an iommu domain to a device + * Return: + * * 0 - success + * * EINVAL - can indicate that device and domain are incompatible due to + * some previous configuration of the domain, in which case the + * driver shouldn't log an error, since it is legitimate for a + * caller to test reuse of existing domains. Otherwise, it may + * still represent some other fundamental problem + * * ENOMEM - out of memory + * * ENOSPC - non-ENOMEM type of resource allocation failures + * * EBUSY - device is attached to a domain and cannot be changed + * * ENODEV - device specific errors, not able to be attached + * * - treated as ENODEV by the caller. Use is discouraged * @detach_dev: detach an iommu domain from a device * @map: map a physically contiguous memory region to an iommu domain * @map_pages: map a physically contiguous set of pages of the same size to -- cgit From b8d3b056a78dcc941fd1a117697ab2b956c2953f Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 11 Oct 2022 17:22:04 +0800 Subject: spi: introduce new helpers with using modern naming For using modern names host/target to instead of all the legacy names, I think it takes 3 steps: - step1: introduce new helpers with modern naming. - step2: switch to use these new helpers in all drivers. - step3: remove all legacy helpers and update all legacy names. This patch is for step1, it introduces new helpers with host/target naming for drivers using. Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20221011092204.950288-1-yangyingliang@huawei.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 8fe3d0a9d2c9..798c30229e07 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -356,6 +356,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @max_speed_hz: Highest supported transfer speed * @flags: other constraints relevant to this driver * @slave: indicates that this is an SPI slave controller + * @target: indicates that this is an SPI target controller * @devm_allocated: whether the allocation of this struct is devres-managed * @max_transfer_size: function that returns the max transfer size for * a &spi_device; may be %NULL, so the default %SIZE_MAX will be used. @@ -440,6 +441,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @mem_caps: controller capabilities for the handling of memory operations. * @unprepare_message: undo any work done by prepare_message(). * @slave_abort: abort the ongoing transfer request on an SPI slave controller + * @target_abort: abort the ongoing transfer request on an SPI target controller * @cs_gpiods: Array of GPIO descs to use as chip select lines; one per CS * number. Any individual value may be NULL for CS lines that * are not GPIOs (driven by the SPI controller itself). @@ -535,8 +537,12 @@ struct spi_controller { /* Flag indicating if the allocation of this struct is devres-managed */ bool devm_allocated; - /* Flag indicating this is an SPI slave controller */ - bool slave; + union { + /* Flag indicating this is an SPI slave controller */ + bool slave; + /* Flag indicating this is an SPI target controller */ + bool target; + }; /* * on some hardware transfer / message size may be constrained @@ -650,6 +656,7 @@ struct spi_controller { int (*unprepare_message)(struct spi_controller *ctlr, struct spi_message *message); int (*slave_abort)(struct spi_controller *ctlr); + int (*target_abort)(struct spi_controller *ctlr); /* * These hooks are for drivers that use a generic implementation @@ -727,6 +734,11 @@ static inline bool spi_controller_is_slave(struct spi_controller *ctlr) return IS_ENABLED(CONFIG_SPI_SLAVE) && ctlr->slave; } +static inline bool spi_controller_is_target(struct spi_controller *ctlr) +{ + return IS_ENABLED(CONFIG_SPI_SLAVE) && ctlr->target; +} + /* PM calls that need to be issued by the driver */ extern int spi_controller_suspend(struct spi_controller *ctlr); extern int spi_controller_resume(struct spi_controller *ctlr); @@ -763,6 +775,21 @@ static inline struct spi_controller *spi_alloc_slave(struct device *host, return __spi_alloc_controller(host, size, true); } +static inline struct spi_controller *spi_alloc_host(struct device *dev, + unsigned int size) +{ + return __spi_alloc_controller(dev, size, false); +} + +static inline struct spi_controller *spi_alloc_target(struct device *dev, + unsigned int size) +{ + if (!IS_ENABLED(CONFIG_SPI_SLAVE)) + return NULL; + + return __spi_alloc_controller(dev, size, true); +} + struct spi_controller *__devm_spi_alloc_controller(struct device *dev, unsigned int size, bool slave); @@ -782,6 +809,21 @@ static inline struct spi_controller *devm_spi_alloc_slave(struct device *dev, return __devm_spi_alloc_controller(dev, size, true); } +static inline struct spi_controller *devm_spi_alloc_host(struct device *dev, + unsigned int size) +{ + return __devm_spi_alloc_controller(dev, size, false); +} + +static inline struct spi_controller *devm_spi_alloc_target(struct device *dev, + unsigned int size) +{ + if (!IS_ENABLED(CONFIG_SPI_SLAVE)) + return NULL; + + return __devm_spi_alloc_controller(dev, size, true); +} + extern int spi_register_controller(struct spi_controller *ctlr); extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); @@ -1141,6 +1183,7 @@ static inline void spi_message_free(struct spi_message *m) extern int spi_setup(struct spi_device *spi); extern int spi_async(struct spi_device *spi, struct spi_message *message); extern int spi_slave_abort(struct spi_device *spi); +extern int spi_target_abort(struct spi_device *spi); static inline size_t spi_max_message_size(struct spi_device *spi) -- cgit From d182dc6de93225cd853de4db68a1a77501bedb6e Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Mon, 24 Oct 2022 13:39:23 +0900 Subject: cpufreq: Generalize of_perf_domain_get_sharing_cpumask phandle format of_perf_domain_get_sharing_cpumask currently assumes a 1-argument phandle format, and directly returns the argument. Generalize this to return the full of_phandle_args, so it can be used by drivers which use other phandle styles (e.g. separate nodes). This also requires changing the CPU sharing match to compare the full args structure. Also, make sure to of_node_put(args.np) (the original code was leaking a reference). Signed-off-by: Hector Martin Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d5595d57f4e5..6a94a6eaad27 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1110,10 +1110,10 @@ cpufreq_table_set_inefficient(struct cpufreq_policy *policy, } static inline int parse_perf_domain(int cpu, const char *list_name, - const char *cell_name) + const char *cell_name, + struct of_phandle_args *args) { struct device_node *cpu_np; - struct of_phandle_args args; int ret; cpu_np = of_cpu_device_node_get(cpu); @@ -1121,41 +1121,44 @@ static inline int parse_perf_domain(int cpu, const char *list_name, return -ENODEV; ret = of_parse_phandle_with_args(cpu_np, list_name, cell_name, 0, - &args); + args); if (ret < 0) return ret; of_node_put(cpu_np); - return args.args[0]; + return 0; } static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name, - const char *cell_name, struct cpumask *cpumask) + const char *cell_name, struct cpumask *cpumask, + struct of_phandle_args *pargs) { - int target_idx; int cpu, ret; + struct of_phandle_args args; - ret = parse_perf_domain(pcpu, list_name, cell_name); + ret = parse_perf_domain(pcpu, list_name, cell_name, pargs); if (ret < 0) return ret; - target_idx = ret; cpumask_set_cpu(pcpu, cpumask); for_each_possible_cpu(cpu) { if (cpu == pcpu) continue; - ret = parse_perf_domain(cpu, list_name, cell_name); + ret = parse_perf_domain(cpu, list_name, cell_name, &args); if (ret < 0) continue; - if (target_idx == ret) + if (pargs->np == args.np && pargs->args_count == args.args_count && + !memcmp(pargs->args, args.args, sizeof(args.args[0]) * args.args_count)) cpumask_set_cpu(cpu, cpumask); + + of_node_put(args.np); } - return target_idx; + return 0; } #else static inline int cpufreq_boost_trigger_state(int state) @@ -1185,7 +1188,8 @@ cpufreq_table_set_inefficient(struct cpufreq_policy *policy, } static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name, - const char *cell_name, struct cpumask *cpumask) + const char *cell_name, struct cpumask *cpumask, + struct of_phandle_args *pargs) { return -EOPNOTSUPP; } -- cgit From a521075d0ab389ec5e1b52a2af01ad41b3cf9792 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Oct 2022 18:29:46 +0300 Subject: device property: Introduce fwnode_device_is_compatible() helper The fwnode_device_is_compatible() helper searches for the given string in the "compatible" string array property and, if found, returns true. Signed-off-by: Andy Shevchenko Reviewed-by: Sakari Ailus --- include/linux/property.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 117cc200c656..67371c963134 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -50,7 +50,6 @@ int device_property_read_string(struct device *dev, const char *propname, int device_property_match_string(struct device *dev, const char *propname, const char *string); -bool fwnode_device_is_available(const struct fwnode_handle *fwnode); bool fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname); int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, @@ -72,6 +71,15 @@ int fwnode_property_read_string(const struct fwnode_handle *fwnode, const char *propname, const char **val); int fwnode_property_match_string(const struct fwnode_handle *fwnode, const char *propname, const char *string); + +bool fwnode_device_is_available(const struct fwnode_handle *fwnode); + +static inline +bool fwnode_device_is_compatible(const struct fwnode_handle *fwnode, const char *compat) +{ + return fwnode_property_match_string(fwnode, "compatible", compat) >= 0; +} + int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int nargs, unsigned int index, -- cgit From d08b0f8f46e45a274fc8c9a5bc92cb9da70d9887 Mon Sep 17 00:00:00 2001 From: Shane Parslow Date: Sat, 29 Oct 2022 02:03:56 -0700 Subject: net: wwan: iosm: add rpc interface for xmm modems Add a new iosm wwan port that connects to the modem rpc interface. This interface provides a configuration channel, and in the case of the 7360, is the only way to configure the modem (as it does not support mbim). The new interface is compatible with existing software, such as open_xdatachannel.py from the xmm7360-pci project [1]. [1] https://github.com/xmm7360/xmm7360-pci Signed-off-by: Shane Parslow Reviewed-by: Loic Poulain Signed-off-by: David S. Miller --- include/linux/wwan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 5ce2acf444fb..24d76500b1cc 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -15,6 +15,7 @@ * @WWAN_PORT_QMI: Qcom modem/MSM interface for modem control * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface * @WWAN_PORT_FIREHOSE: XML based command protocol + * @WWAN_PORT_XMMRPC: Control protocol for Intel XMM modems * * @WWAN_PORT_MAX: Highest supported port types * @WWAN_PORT_UNKNOWN: Special value to indicate an unknown port type @@ -26,6 +27,7 @@ enum wwan_port_type { WWAN_PORT_QMI, WWAN_PORT_QCDM, WWAN_PORT_FIREHOSE, + WWAN_PORT_XMMRPC, /* Add new port types above this line */ -- cgit From 89aed3cd5cb951113b766cddd9c2df43cfbdafd5 Mon Sep 17 00:00:00 2001 From: Benedikt Niedermayr Date: Wed, 2 Nov 2022 14:30:46 +0100 Subject: memory: omap-gpmc: wait pin additions This patch introduces support for setting the wait-pin polarity as well as using the same wait-pin for different CS regions. The waitpin polarity can be configured via the WAITPINPOLARITY bits in the GPMC_CONFIG register. This is currently not supported by the driver. This patch adds support for setting the required register bits with the "ti,wait-pin-polarity" dt-property. The wait-pin can also be shared between different CS regions for special usecases. Therefore GPMC must keep track of wait-pin allocations, so it knows that either GPMC itself or another driver has the ownership. Signed-off-by: Benedikt Niedermayr Link: https://lore.kernel.org/r/20221102133047.1654449-2-benedikt.niedermayr@siemens.com Reviewed-by: Roger Quadros Signed-off-by: Krzysztof Kozlowski --- include/linux/platform_data/gpmc-omap.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h index c9cc4e32435d..296b080c5c67 100644 --- a/include/linux/platform_data/gpmc-omap.h +++ b/include/linux/platform_data/gpmc-omap.h @@ -136,6 +136,13 @@ struct gpmc_device_timings { #define GPMC_MUX_AAD 1 /* Addr-Addr-Data multiplex */ #define GPMC_MUX_AD 2 /* Addr-Data multiplex */ +/* Wait pin polarity values */ +#define GPMC_WAITPINPOLARITY_INVALID -1 +#define GPMC_WAITPINPOLARITY_ACTIVE_LOW 0 +#define GPMC_WAITPINPOLARITY_ACTIVE_HIGH 1 + +#define GPMC_WAITPIN_INVALID -1 + struct gpmc_settings { bool burst_wrap; /* enables wrap bursting */ bool burst_read; /* enables read page/burst mode */ @@ -149,6 +156,7 @@ struct gpmc_settings { u32 device_width; /* device bus width (8 or 16 bit) */ u32 mux_add_data; /* multiplex address & data */ u32 wait_pin; /* wait-pin to be used */ + u32 wait_pin_polarity; }; /* Data for each chip select */ -- cgit From 80bd4a7aab4c9ce59bf5e35fdf52aa23d8a3c9f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:47 +0100 Subject: blk-mq: move the srcu_struct used for quiescing to the tagset All I/O submissions have fairly similar latencies, and a tagset-wide quiesce is a fairly common operation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Ming Lei Reviewed-by: Chao Leng Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-12-hch@lst.de [axboe: fix whitespace] Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++++ include/linux/blkdev.h | 9 --------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 569053ed959d..f059edebb11d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -7,6 +7,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -500,6 +501,8 @@ enum hctx_type { * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. + * @srcu: Use as lock when type of the request queue is blocking + * (BLK_MQ_F_BLOCKING). */ struct blk_mq_tag_set { struct blk_mq_queue_map map[HCTX_MAX_TYPES]; @@ -520,6 +523,7 @@ struct blk_mq_tag_set { struct mutex tag_list_lock; struct list_head tag_list; + struct srcu_struct *srcu; }; /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 32137d85c9ad..6a6fa167fc82 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -543,18 +542,11 @@ struct request_queue { struct mutex debugfs_mutex; bool mq_sysfs_init_done; - - /** - * @srcu: Sleepable RCU. Use as lock when type of the request queue - * is blocking (BLK_MQ_F_BLOCKING). Must be the last member - */ - struct srcu_struct srcu[]; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ -#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ @@ -590,7 +582,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) -#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ -- cgit From 483239c75ba768e0e2c0e0c503e5fc13c3d5773a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:48 +0100 Subject: blk-mq: pass a tagset to blk_mq_wait_quiesce_done Nothing in blk_mq_wait_quiesce_done needs the request_queue now, so just pass the tagset, and move the non-mq check into the only caller that needs it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chao Leng Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-13-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f059edebb11d..061ea6e7af01 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -880,7 +880,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); -void blk_mq_wait_quiesce_done(struct request_queue *q); +void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -- cgit From 414dd48e882c5a39e7bd01b096ee6497eb3314b0 Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Tue, 1 Nov 2022 16:00:49 +0100 Subject: blk-mq: add tagset quiesce interface Drivers that have shared tagsets may need to quiesce potentially a lot of request queues that all share a single tagset (e.g. nvme). Add an interface to quiesce all the queues on a given tagset. This interface is useful because it can speedup the quiesce by doing it in parallel. Because some queues should not need to be quiesced (e.g. the nvme connect_q) when quiescing the tagset, introduce a QUEUE_FLAG_SKIP_TAGSET_QUIESCE flag to allow this new interface to ski quiescing a particular queue. Signed-off-by: Chao Leng [hch: simplify for the per-tag_set srcu_struct] Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lei Reviewed-by: Chao Leng Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-14-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ include/linux/blkdev.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 061ea6e7af01..109a0e30c470 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -881,6 +881,8 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a6fa167fc82..9188aa3f6259 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,7 @@ struct request_queue { #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ +#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ #define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ (1UL << QUEUE_FLAG_SAME_COMP) | \ @@ -610,6 +611,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) +#define blk_queue_skip_tagset_quiesce(q) \ + test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit From 4b21d25bf519c9487935a664886956bb18f04f6d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Oct 2022 23:11:25 +0300 Subject: overflow: Introduce overflows_type() and castable_to_type() Implement a robust overflows_type() macro to test if a variable or constant value would overflow another variable or type. This can be used as a constant expression for static_assert() (which requires a constant expression[1][2]) when used on constant values. This must be constructed manually, since __builtin_add_overflow() does not produce a constant expression[3]. Additionally adds castable_to_type(), similar to __same_type(), but for checking if a constant value would overflow if cast to a given type. Add unit tests for overflows_type(), __same_type(), and castable_to_type() to the existing KUnit "overflow" test: [16:03:33] ================== overflow (21 subtests) ================== ... [16:03:33] [PASSED] overflows_type_test [16:03:33] [PASSED] same_type_test [16:03:33] [PASSED] castable_to_type_test [16:03:33] ==================== [PASSED] overflow ===================== [16:03:33] ============================================================ [16:03:33] Testing complete. Ran 21 tests: passed: 21 [16:03:33] Elapsed time: 24.022s total, 0.002s configuring, 22.598s building, 0.767s running [1] https://en.cppreference.com/w/c/language/_Static_assert [2] C11 standard (ISO/IEC 9899:2011): 6.7.10 Static assertions [3] https://gcc.gnu.org/onlinedocs/gcc/Integer-Overflow-Builtins.html 6.56 Built-in Functions to Perform Arithmetic with Overflow Checking Built-in Function: bool __builtin_add_overflow (type1 a, type2 b, Cc: Luc Van Oostenryck Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Cc: Daniel Latypov Cc: Vitor Massaru Iha Cc: "Gustavo A. R. Silva" Cc: Jani Nikula Cc: Mauro Carvalho Chehab Cc: linux-hardening@vger.kernel.org Cc: llvm@lists.linux.dev Co-developed-by: Gwan-gyeong Mun Signed-off-by: Gwan-gyeong Mun Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221024201125.1416422-1-gwan-gyeong.mun@intel.com --- include/linux/compiler.h | 1 + include/linux/overflow.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 973a1bfd7ef5..947a60b801db 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -236,6 +236,7 @@ static inline void *offset_to_ptr(const int *off) * bool and also pointer types. */ #define is_signed_type(type) (((type)(-1)) < (__force type)1) +#define is_unsigned_type(type) (!is_signed_type(type)) /* * This is needed in functions which generate the stack canary, see diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 1d3be1a2204c..0e33b5cbdb9f 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -128,6 +128,53 @@ static inline bool __must_check __must_check_overflow(bool overflow) (*_d >> _to_shift) != _a); \ })) +#define __overflows_type_constexpr(x, T) ( \ + is_unsigned_type(typeof(x)) ? \ + (x) > type_max(typeof(T)) : \ + is_unsigned_type(typeof(T)) ? \ + (x) < 0 || (x) > type_max(typeof(T)) : \ + (x) < type_min(typeof(T)) || (x) > type_max(typeof(T))) + +#define __overflows_type(x, T) ({ \ + typeof(T) v = 0; \ + check_add_overflow((x), v, &v); \ +}) + +/** + * overflows_type - helper for checking the overflows between value, variables, + * or data type + * + * @n: source constant value or variable to be checked + * @T: destination variable or data type proposed to store @x + * + * Compares the @x expression for whether or not it can safely fit in + * the storage of the type in @T. @x and @T can have different types. + * If @x is a constant expression, this will also resolve to a constant + * expression. + * + * Returns: true if overflow can occur, false otherwise. + */ +#define overflows_type(n, T) \ + __builtin_choose_expr(__is_constexpr(n), \ + __overflows_type_constexpr(n, T), \ + __overflows_type(n, T)) + +/** + * castable_to_type - like __same_type(), but also allows for casted literals + * + * @n: variable or constant value + * @T: variable or data type + * + * Unlike the __same_type() macro, this allows a constant value as the + * first argument. If this value would not overflow into an assignment + * of the second argument's type, it returns true. Otherwise, this falls + * back to __same_type(). + */ +#define castable_to_type(n, T) \ + __builtin_choose_expr(__is_constexpr(n), \ + !__overflows_type_constexpr(n, T), \ + __same_type(n, T)) + /** * size_mul() - Calculate size_t multiplication with saturation at SIZE_MAX * @factor1: first factor -- cgit From 8275b48b278096edc1e3ea5aa9cf946a10022f79 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Tue, 4 Oct 2022 12:49:25 +0200 Subject: tty: serial: introduce transmit helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many serial drivers do the same thing: * send x_char if set * keep sending from the xmit circular buffer until either - the loop reaches the end of the xmit buffer - TX is stopped - HW fifo is full * check for pending characters and: - wake up tty writers to fill for more data into xmit buffer - stop TX if there is nothing in the xmit buffer The only differences are: * how to write the character to the HW fifo * the check of the end condition: - is the HW fifo full? - is limit of the written characters reached? So unify the above into two helpers: * uart_port_tx_limited() -- it performs the above taking the written characters limit into account, and * uart_port_tx() -- the same as above, except it only checks the HW readiness, not the characters limit. The HW specific operations (as stated as "differences" above) are passed as arguments to the macros. They are: * tx_ready -- returns true if HW can accept more data. * put_char -- write a character to the device. * tx_done -- when the write loop is done, perform arbitrary action before potential invocation of ops->stop_tx() happens. Note that the above are macros. This means the code is generated in place and the above 3 arguments are "inlined". I.e. no added penalty by generating call instructions for every single character. Nor any indirect calls. (As in some previous versions of this patchset.) Reviewed-by: Ilpo Järvinen Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20221004104927.14361-2-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 80 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index d657f2a42a7b..dbbc4408bb19 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -664,6 +664,86 @@ struct uart_driver { void uart_write_wakeup(struct uart_port *port); +#define __uart_port_tx(uport, ch, tx_ready, put_char, tx_done, for_test, \ + for_post) \ +({ \ + struct uart_port *__port = (uport); \ + struct circ_buf *xmit = &__port->state->xmit; \ + unsigned int pending; \ + \ + for (; (for_test) && (tx_ready); (for_post), __port->icount.tx++) { \ + if (__port->x_char) { \ + (ch) = __port->x_char; \ + (put_char); \ + __port->x_char = 0; \ + continue; \ + } \ + \ + if (uart_circ_empty(xmit) || uart_tx_stopped(__port)) \ + break; \ + \ + (ch) = xmit->buf[xmit->tail]; \ + (put_char); \ + xmit->tail = (xmit->tail + 1) % UART_XMIT_SIZE; \ + } \ + \ + (tx_done); \ + \ + pending = uart_circ_chars_pending(xmit); \ + if (pending < WAKEUP_CHARS) { \ + uart_write_wakeup(__port); \ + \ + if (pending == 0) \ + __port->ops->stop_tx(__port); \ + } \ + \ + pending; \ +}) + +/** + * uart_port_tx_limited -- transmit helper for uart_port with count limiting + * @port: uart port + * @ch: variable to store a character to be written to the HW + * @count: a limit of characters to send + * @tx_ready: can HW accept more data function + * @put_char: function to write a character + * @tx_done: function to call after the loop is done + * + * This helper transmits characters from the xmit buffer to the hardware using + * @put_char(). It does so until @count characters are sent and while @tx_ready + * evaluates to true. + * + * Returns: the number of characters in the xmit buffer when done. + * + * The expression in macro parameters shall be designed as follows: + * * **tx_ready:** should evaluate to true if the HW can accept more data to + * be sent. This parameter can be %true, which means the HW is always ready. + * * **put_char:** shall write @ch to the device of @port. + * * **tx_done:** when the write loop is done, this can perform arbitrary + * action before potential invocation of ops->stop_tx() happens. If the + * driver does not need to do anything, use e.g. ({}). + * + * For all of them, @port->lock is held, interrupts are locally disabled and + * the expressions must not sleep. + */ +#define uart_port_tx_limited(port, ch, count, tx_ready, put_char, tx_done) ({ \ + unsigned int __count = (count); \ + __uart_port_tx(port, ch, tx_ready, put_char, tx_done, __count, \ + __count--); \ +}) + +/** + * uart_port_tx -- transmit helper for uart_port + * @port: uart port + * @ch: variable to store a character to be written to the HW + * @tx_ready: can HW accept more data function + * @put_char: function to write a character + * + * See uart_port_tx_limited() for more details. + */ +#define uart_port_tx(port, ch, tx_ready, put_char) \ + __uart_port_tx(port, ch, tx_ready, put_char, ({}), true, ({})) + /* * Baud rate helpers. */ -- cgit From fd1845069711cdf1b1aaaa0f22311b7736396331 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 31 Oct 2022 16:37:02 -0700 Subject: regulator: devres: Add devm_regulator_bulk_get_exclusive() We had an exclusive variant of the devm_regulator_get() API, but no corresponding variant for the bulk API; let's add one now. We add a generalized version of the existing regulator_bulk_get() function that additionally takes a get_type parameter and redefine regulator_bulk_get() in terms of it, then do similarly with devm_regulator_bulk_get(), and finally add the new devm_regulator_bulk_get_exclusive(). Signed-off-by: Zev Weiss Link: https://lore.kernel.org/r/20221031233704.22575-2-zev@bewilderbeest.net Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index ee3b4a014611..628a52b8e63f 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -247,6 +247,8 @@ int __must_check regulator_bulk_get(struct device *dev, int num_consumers, int __must_check devm_regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); void devm_regulator_bulk_put(struct regulator_bulk_data *consumers); +int __must_check devm_regulator_bulk_get_exclusive(struct device *dev, int num_consumers, + struct regulator_bulk_data *consumers); int __must_check devm_regulator_bulk_get_const( struct device *dev, int num_consumers, const struct regulator_bulk_data *in_consumers, -- cgit From 5c51d4afcf3fd36159713556402e16cfab794ae9 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 31 Oct 2022 16:37:04 -0700 Subject: regulator: userspace-consumer: Handle regulator-output DT nodes In addition to adding some fairly simple OF support code, we make some slight adjustments to the userspace-consumer driver to properly support use with regulator-output hardware: - We now do an exclusive get of the supply regulators so as to prevent regulator_init_complete_work from automatically disabling them. - Instead of assuming that the supply is initially disabled, we now query its state to determine the initial value of drvdata->enabled. Signed-off-by: Zev Weiss Link: https://lore.kernel.org/r/20221031233704.22575-4-zev@bewilderbeest.net Signed-off-by: Mark Brown --- include/linux/regulator/userspace-consumer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/regulator/userspace-consumer.h b/include/linux/regulator/userspace-consumer.h index b5dba0628951..2249ee697f8b 100644 --- a/include/linux/regulator/userspace-consumer.h +++ b/include/linux/regulator/userspace-consumer.h @@ -21,6 +21,7 @@ struct regulator_userspace_consumer_data { struct regulator_bulk_data *supplies; bool init_on; + bool no_autoswitch; }; #endif /* __REGULATOR_PLATFORM_CONSUMER_H_ */ -- cgit From 1adf3cc20d693569ebee90fd91fa34b0570fcd6f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:05 +0800 Subject: iommu: Add max_pasids field in struct iommu_device Use this field to keep the number of supported PASIDs that an IOMMU hardware is able to support. This is a generic attribute of an IOMMU and lifting it into the per-IOMMU device structure makes it possible to allocate a PASID for device without calls into the IOMMU drivers. Any iommu driver that supports PASID related features should set this field before enabling them on the devices. In the Intel IOMMU driver, intel_iommu_sm is moved to CONFIG_INTEL_IOMMU enclave so that the pasid_supported() helper could be used in dmar.c without compilation errors. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3c9da1f8979e..e3af4f46e6e0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -322,12 +322,14 @@ struct iommu_domain_ops { * @list: Used by the iommu-core to keep a list of registered iommus * @ops: iommu-ops for talking to this iommu * @dev: struct device for sysfs handling + * @max_pasids: number of supported PASIDs */ struct iommu_device { struct list_head list; const struct iommu_ops *ops; struct fwnode_handle *fwnode; struct device *dev; + u32 max_pasids; }; /** -- cgit From 22d2c7afb3697a68c7fc05c935ef662dee06dc60 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:06 +0800 Subject: iommu: Add max_pasids field in struct dev_iommu Use this field to save the number of PASIDs that a device is able to consume. It is a generic attribute of a device and lifting it into the per-device dev_iommu struct could help to avoid the boilerplate code in various IOMMU drivers. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e3af4f46e6e0..ac3f6c6dcc6d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -368,6 +368,7 @@ struct iommu_fault_param { * @fwspec: IOMMU fwspec data * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data + * @max_pasids: number of PASIDs this device can consume * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. * struct iommu_group *iommu_group; @@ -379,6 +380,7 @@ struct dev_iommu { struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; + u32 max_pasids; }; int iommu_device_register(struct iommu_device *iommu, -- cgit From 942fd5435dccb273f90176b046ae6bbba60cfbd8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:07 +0800 Subject: iommu: Remove SVM_FLAG_SUPERVISOR_MODE support The current kernel DMA with PASID support is based on the SVA with a flag SVM_FLAG_SUPERVISOR_MODE. The IOMMU driver binds the kernel memory address space to a PASID of the device. The device driver programs the device with kernel virtual address (KVA) for DMA access. There have been security and functional issues with this approach: - The lack of IOTLB synchronization upon kernel page table updates. (vmalloc, module/BPF loading, CONFIG_DEBUG_PAGEALLOC etc.) - Other than slight more protection, using kernel virtual address (KVA) has little advantage over physical address. There are also no use cases yet where DMA engines need kernel virtual addresses for in-kernel DMA. This removes SVM_FLAG_SUPERVISOR_MODE support from the IOMMU interface. The device drivers are suggested to handle kernel DMA with PASID through the kernel DMA APIs. The drvdata parameter in iommu_sva_bind_device() and all callbacks is not needed anymore. Cleanup them as well. Link: https://lore.kernel.org/linux-iommu/20210511194726.GP1002214@nvidia.com/ Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Fenghua Yu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-svm.h | 13 ------------- include/linux/iommu.h | 8 +++----- 2 files changed, 3 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 207ef06ba3e1..f9a0d44f6fdb 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -13,17 +13,4 @@ #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) #define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) -/* - * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only - * for access to kernel addresses. No IOTLB flushes are automatically done - * for kernel mappings; it is valid only for access to the kernel's static - * 1:1 mapping of physical memory — not to vmalloc or even module mappings. - * A future API addition may permit the use of such ranges, by means of an - * explicit IOTLB flush call (akin to the DMA API's unmap method). - * - * It is unlikely that we will ever hook into flush_tlb_kernel_range() to - * do such IOTLB flushes automatically. - */ -#define SVM_FLAG_SUPERVISOR_MODE BIT(0) - #endif /* __INTEL_SVM_H__ */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ac3f6c6dcc6d..72bb0531aa76 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -247,8 +247,7 @@ struct iommu_ops { int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm, - void *drvdata); + struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm); void (*sva_unbind)(struct iommu_sva *handle); u32 (*sva_get_pasid)(struct iommu_sva *handle); @@ -668,8 +667,7 @@ int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); struct iommu_sva *iommu_sva_bind_device(struct device *dev, - struct mm_struct *mm, - void *drvdata); + struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); @@ -1000,7 +998,7 @@ iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) } static inline struct iommu_sva * -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata) +iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { return NULL; } -- cgit From 16603704559c7a68718059c4f75287886c01b20f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:09 +0800 Subject: iommu: Add attach/detach_dev_pasid iommu interfaces Attaching an IOMMU domain to a PASID of a device is a generic operation for modern IOMMU drivers which support PASID-granular DMA address translation. Currently visible usage scenarios include (but not limited): - SVA (Shared Virtual Address) - kernel DMA with PASID - hardware-assist mediated device This adds the set_dev_pasid domain ops for setting the domain onto a PASID of a device and remove_dev_pasid iommu ops for removing any setup on a PASID of device. This also adds interfaces for device drivers to attach/detach/retrieve a domain for a PASID of a device. If multiple devices share a single group, it's fine as long the fabric always routes every TLP marked with a PASID to the host bridge and only the host bridge. For example, ACS achieves this universally and has been checked when pci_enable_pasid() is called. As we can't reliably tell the source apart in a group, all the devices in a group have to be considered as the same source, and mapped to the same PASID table. The DMA ownership is about the whole device (more precisely, iommu group), including the RID and PASIDs. When the ownership is converted, the pasid array must be empty. This also adds necessary checks in the DMA ownership interfaces. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 72bb0531aa76..5d2b78ac5416 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,6 +223,9 @@ struct iommu_iotlb_gather { * - IOMMU_DOMAIN_DMA: must use a dma domain * - 0: use the default setting * @default_domain_ops: the default ops for domains + * @remove_dev_pasid: Remove any translation configurations of a specific + * pasid, so that any DMA transactions with this pasid + * will be blocked by the hardware. * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops */ @@ -256,6 +259,7 @@ struct iommu_ops { struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); + void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid); const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; @@ -266,6 +270,7 @@ struct iommu_ops { * struct iommu_domain_ops - domain specific operations * @attach_dev: attach an iommu domain to a device * @detach_dev: detach an iommu domain from a device + * @set_dev_pasid: set an iommu domain to a pasid of device * @map: map a physically contiguous memory region to an iommu domain * @map_pages: map a physically contiguous set of pages of the same size to * an iommu domain. @@ -286,6 +291,8 @@ struct iommu_ops { struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, struct device *dev); + int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid); int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); @@ -678,6 +685,13 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +int iommu_attach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +void iommu_detach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +struct iommu_domain * +iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, + unsigned int type); #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; @@ -1040,6 +1054,24 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) { return false; } + +static inline int iommu_attach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + return -ENODEV; +} + +static inline void iommu_detach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ +} + +static inline struct iommu_domain * +iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, + unsigned int type) +{ + return NULL; +} #endif /* CONFIG_IOMMU_API */ /** -- cgit From 136467962e49931dbc6240aea8197fab7e407ba4 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:10 +0800 Subject: iommu: Add IOMMU SVA domain support The SVA iommu_domain represents a hardware pagetable that the IOMMU hardware could use for SVA translation. This adds some infrastructures to support SVA domain in the iommu core. It includes: - Extend the iommu_domain to support a new IOMMU_DOMAIN_SVA domain type. The IOMMU drivers that support allocation of the SVA domain should provide its own SVA domain specific iommu_domain_ops. - Add a helper to allocate an SVA domain. The iommu_domain_free() is still used to free an SVA domain. The report_iommu_fault() should be replaced by the new iommu_report_device_fault(). Leave the existing fault handler with the existing users and the newly added SVA members excludes it. Suggested-by: Jean-Philippe Brucker Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5d2b78ac5416..776baa375967 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -64,6 +64,8 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */ #define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */ +#define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */ + /* * This are the possible domain-types * @@ -77,6 +79,8 @@ struct iommu_domain_geometry { * certain optimizations for these domains * IOMMU_DOMAIN_DMA_FQ - As above, but definitely using batched TLB * invalidation. + * IOMMU_DOMAIN_SVA - DMA addresses are shared process addresses + * represented by mm_struct's. */ #define IOMMU_DOMAIN_BLOCKED (0U) #define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT) @@ -86,15 +90,24 @@ struct iommu_domain_geometry { #define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING | \ __IOMMU_DOMAIN_DMA_API | \ __IOMMU_DOMAIN_DMA_FQ) +#define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA) struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ - iommu_fault_handler_t handler; - void *handler_token; struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + union { + struct { + iommu_fault_handler_t handler; + void *handler_token; + }; + struct { /* IOMMU_DOMAIN_SVA */ + struct mm_struct *mm; + int users; + }; + }; }; static inline bool iommu_is_dma_domain(struct iommu_domain *domain) @@ -685,6 +698,8 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); void iommu_detach_device_pasid(struct iommu_domain *domain, @@ -1055,6 +1070,12 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) return false; } +static inline struct iommu_domain * +iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) +{ + return NULL; +} + static inline int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { -- cgit From be51b1d6bbff48c7d1943a8ff1e5a55777807f6e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:13 +0800 Subject: iommu/sva: Refactoring iommu_sva_bind/unbind_device() The existing iommu SVA interfaces are implemented by calling the SVA specific iommu ops provided by the IOMMU drivers. There's no need for any SVA specific ops in iommu_ops vector anymore as we can achieve this through the generic attach/detach_dev_pasid domain ops. This refactors the IOMMU SVA interfaces implementation by using the iommu_attach/detach_device_pasid interfaces and align them with the concept of the SVA iommu domain. Put the new SVA code in the SVA related file in order to make it self-contained. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 776baa375967..bee5659d07eb 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -645,6 +645,7 @@ struct iommu_fwspec { */ struct iommu_sva { struct device *dev; + struct iommu_domain *domain; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, @@ -686,11 +687,6 @@ void iommu_release_device(struct device *dev); int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); -struct iommu_sva *iommu_sva_bind_device(struct device *dev, - struct mm_struct *mm); -void iommu_sva_unbind_device(struct iommu_sva *handle); -u32 iommu_sva_get_pasid(struct iommu_sva *handle); - int iommu_device_use_default_domain(struct device *dev); void iommu_device_unuse_default_domain(struct device *dev); @@ -1026,21 +1022,6 @@ iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) return -ENODEV; } -static inline struct iommu_sva * -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) -{ - return NULL; -} - -static inline void iommu_sva_unbind_device(struct iommu_sva *handle) -{ -} - -static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) -{ - return IOMMU_PASID_INVALID; -} - static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { return NULL; @@ -1154,4 +1135,26 @@ static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_m #endif /* CONFIG_IOMMU_DMA */ +#ifdef CONFIG_IOMMU_SVA +struct iommu_sva *iommu_sva_bind_device(struct device *dev, + struct mm_struct *mm); +void iommu_sva_unbind_device(struct iommu_sva *handle); +u32 iommu_sva_get_pasid(struct iommu_sva *handle); +#else +static inline struct iommu_sva * +iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) +{ + return NULL; +} + +static inline void iommu_sva_unbind_device(struct iommu_sva *handle) +{ +} + +static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) +{ + return IOMMU_PASID_INVALID; +} +#endif /* CONFIG_IOMMU_SVA */ + #endif /* __LINUX_IOMMU_H */ -- cgit From 1c263576f4735e063e234fa5f43fd3046d36b5b3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:14 +0800 Subject: iommu: Remove SVA related callbacks from iommu ops These ops'es have been deprecated. There's no need for them anymore. Remove them to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bee5659d07eb..c337ef1c97bc 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -227,9 +227,6 @@ struct iommu_iotlb_gather { * driver init to device driver init (default no) * @dev_enable/disable_feat: per device entries to enable/disable * iommu specific features. - * @sva_bind: Bind process address space to device - * @sva_unbind: Unbind process address space from device - * @sva_get_pasid: Get PASID associated to a SVA handle * @page_response: handle page request response * @def_domain_type: device default domain type, return value: * - IOMMU_DOMAIN_IDENTITY: must use an identity domain @@ -263,10 +260,6 @@ struct iommu_ops { int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm); - void (*sva_unbind)(struct iommu_sva *handle); - u32 (*sva_get_pasid)(struct iommu_sva *handle); - int (*page_response)(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); -- cgit From 8cc93159f91960b4812ea48887e9e7501babc95a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:15 +0800 Subject: iommu: Prepare IOMMU domain for IOPF This adds some mechanisms around the iommu_domain so that the I/O page fault handling framework could route a page fault to the domain and call the fault handler from it. Add pointers to the page fault handler and its private data in struct iommu_domain. The fault handler will be called with the private data as a parameter once a page fault is routed to the domain. Any kernel component which owns an iommu domain could install handler and its private parameter so that the page fault could be further routed and handled. This also prepares the SVA implementation to be the first consumer of the per-domain page fault handling model. The I/O page fault handler for SVA is copied to the SVA file with mmget_not_zero() added before mmap_read_lock(). Suggested-by: Jean-Philippe Brucker Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c337ef1c97bc..7d2648058e43 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -98,6 +98,9 @@ struct iommu_domain { unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + enum iommu_page_response_code (*iopf_handler)(struct iommu_fault *fault, + void *data); + void *fault_data; union { struct { iommu_fault_handler_t handler; -- cgit From 23da464dd6b8935b66f4ee306ad8947fd32ccd75 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:51 +0530 Subject: bpf: Allow specifying volatile type modifier for kptrs This is useful in particular to mark the pointer as volatile, so that compiler treats each load and store to the field as a volatile access. The alternative is having to define and use READ_ONCE and WRITE_ONCE in the BPF program. Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: David Vernet Link: https://lore.kernel.org/r/20221103191013.1236066-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9aababc5d78..86aad9b2ce02 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -288,6 +288,11 @@ static inline bool btf_type_is_typedef(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; } +static inline bool btf_type_is_volatile(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VOLATILE; +} + static inline bool btf_type_is_func(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; -- cgit From a35ec8e38cdd1766f29924ca391a01de20163931 Mon Sep 17 00:00:00 2001 From: "Hans J. Schultz" Date: Tue, 1 Nov 2022 21:39:21 +0200 Subject: bridge: Add MAC Authentication Bypass (MAB) support Hosts that support 802.1X authentication are able to authenticate themselves by exchanging EAPOL frames with an authenticator (Ethernet bridge, in this case) and an authentication server. Access to the network is only granted by the authenticator to successfully authenticated hosts. The above is implemented in the bridge using the "locked" bridge port option. When enabled, link-local frames (e.g., EAPOL) can be locally received by the bridge, but all other frames are dropped unless the host is authenticated. That is, unless the user space control plane installed an FDB entry according to which the source address of the frame is located behind the locked ingress port. The entry can be dynamic, in which case learning needs to be enabled so that the entry will be refreshed by incoming traffic. There are deployments in which not all the devices connected to the authenticator (the bridge) support 802.1X. Such devices can include printers and cameras. One option to support such deployments is to unlock the bridge ports connecting these devices, but a slightly more secure option is to use MAB. When MAB is enabled, the MAC address of the connected device is used as the user name and password for the authentication. For MAB to work, the user space control plane needs to be notified about MAC addresses that are trying to gain access so that they will be compared against an allow list. This can be implemented via the regular learning process with the sole difference that learned FDB entries are installed with a new "locked" flag indicating that the entry cannot be used to authenticate the device. The flag cannot be set by user space, but user space can clear the flag by replacing the entry, thereby authenticating the device. Locked FDB entries implement the following semantics with regards to roaming, aging and forwarding: 1. Roaming: Locked FDB entries can roam to unlocked (authorized) ports, in which case the "locked" flag is cleared. FDB entries cannot roam to locked ports regardless of MAB being enabled or not. Therefore, locked FDB entries are only created if an FDB entry with the given {MAC, VID} does not already exist. This behavior prevents unauthenticated devices from disrupting traffic destined to already authenticated devices. 2. Aging: Locked FDB entries age and refresh by incoming traffic like regular entries. 3. Forwarding: Locked FDB entries forward traffic like regular entries. If user space detects an unauthorized MAC behind a locked port and wishes to prevent traffic with this MAC DA from reaching the host, it can do so using tc or a different mechanism. Enable the above behavior using a new bridge port option called "mab". It can only be enabled on a bridge port that is both locked and has learning enabled. Locked FDB entries are flushed from the port once MAB is disabled. A new option is added because there are pure 802.1X deployments that are not interested in notifications about locked FDB entries. Signed-off-by: Hans J. Schultz Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index d62ef428e3aa..1668ac4d7adc 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -59,6 +59,7 @@ struct br_ip_list { #define BR_MRP_LOST_IN_CONT BIT(19) #define BR_TX_FWD_OFFLOAD BIT(20) #define BR_PORT_LOCKED BIT(21) +#define BR_PORT_MAB BIT(22) #define BR_DEFAULT_AGEING_TIME (300 * HZ) -- cgit From 02a68a47eadedf95748facfca6ced31fb0181d52 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:03 +0100 Subject: net: devlink: track netdev with devlink_port assigned Currently, ethernet drivers are using devlink_port_type_eth_set() and devlink_port_type_clear() to set devlink port type and link to related netdev. Instead of calling them directly, let the driver use SET_NETDEV_DEVLINK_PORT macro to assign devlink_port pointer and let devlink to track it. Note the devlink port pointer is static during the time netdevice is registered. In devlink code, use per-namespace netdev notifier to track the netdevices with devlink_port assigned and change the internal devlink_port type and related type pointer accordingly. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4b5052db978f..f048a30ea10b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1999,6 +1999,11 @@ enum netdev_ml_priv_type { * registered * @offload_xstats_l3: L3 HW stats for this netdevice. * + * @devlink_port: Pointer to related devlink port structure. + * Assigned by a driver before netdev registration using + * SET_NETDEV_DEVLINK_PORT macro. This pointer is static + * during the time netdevice is registered. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2349,9 +2354,22 @@ struct net_device { netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; struct rtnl_hw_stats64 *offload_xstats_l3; + + struct devlink_port *devlink_port; }; #define to_net_dev(d) container_of(d, struct net_device, dev) +/* + * Driver should use this to assign devlink port instance to a netdevice + * before it registers the netdevice. Therefore devlink_port is static + * during the netdev lifetime after it is registered. + */ +#define SET_NETDEV_DEVLINK_PORT(dev, port) \ +({ \ + WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED); \ + ((dev)->devlink_port = (port)); \ +}) + static inline bool netif_elide_gro(const struct net_device *dev) { if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) @@ -2785,6 +2803,7 @@ enum netdev_cmd { NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, + NETDEV_PRE_UNINIT, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, -- cgit From 77df1db80da384c565106321f5934967690da7dd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:10 +0100 Subject: net: remove unused ndo_get_devlink_port Remove ndo_get_devlink_port which is no longer used alongside with the implementations in drivers. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f048a30ea10b..d45713a06568 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1366,10 +1366,6 @@ struct netdev_net_notifier { * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. - * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev); - * Get devlink port instance associated with a given netdev. - * Called with a reference on the netdevice and devlink locks only, - * rtnl_lock is not held. * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. @@ -1600,7 +1596,6 @@ struct net_device_ops { struct xdp_buff *xdp); int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); - struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); -- cgit From aa3496accc412b3d975e4ee5d06076d73394d8b5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:55 +0530 Subject: bpf: Refactor kptr_off_tab into btf_record To prepare the BPF verifier to handle special fields in both map values and program allocated types coming from program BTF, we need to refactor the kptr_off_tab handling code into something more generic and reusable across both cases to avoid code duplication. Later patches also require passing this data to helpers at runtime, so that they can work on user defined types, initialize them, destruct them, etc. The main observation is that both map values and such allocated types point to a type in program BTF, hence they can be handled similarly. We can prepare a field metadata table for both cases and store them in struct bpf_map or struct btf depending on the use case. Hence, refactor the code into generic btf_record and btf_field member structs. The btf_record represents the fields of a specific btf_type in user BTF. The cnt indicates the number of special fields we successfully recognized, and field_mask is a bitmask of fields that were found, to enable quick determination of availability of a certain field. Subsequently, refactor the rest of the code to work with these generic types, remove assumptions about kptr and kptr_off_tab, rename variables to more meaningful names, etc. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 125 ++++++++++++++++++++++++++++++++++------------------ include/linux/btf.h | 3 +- 2 files changed, 82 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8d948bfcb984..5f2a42033a37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -165,35 +165,41 @@ struct bpf_map_ops { }; enum { - /* Support at most 8 pointers in a BPF map value */ - BPF_MAP_VALUE_OFF_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BPF_MAP_VALUE_OFF_MAX + + /* Support at most 8 pointers in a BTF type */ + BTF_FIELDS_MAX = 8, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + 1 + /* for bpf_spin_lock */ 1, /* for bpf_timer */ }; -enum bpf_kptr_type { - BPF_KPTR_UNREF, - BPF_KPTR_REF, +enum btf_field_type { + BPF_KPTR_UNREF = (1 << 2), + BPF_KPTR_REF = (1 << 3), + BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, }; -struct bpf_map_value_off_desc { +struct btf_field_kptr { + struct btf *btf; + struct module *module; + btf_dtor_kfunc_t dtor; + u32 btf_id; +}; + +struct btf_field { u32 offset; - enum bpf_kptr_type type; - struct { - struct btf *btf; - struct module *module; - btf_dtor_kfunc_t dtor; - u32 btf_id; - } kptr; + enum btf_field_type type; + union { + struct btf_field_kptr kptr; + }; }; -struct bpf_map_value_off { - u32 nr_off; - struct bpf_map_value_off_desc off[]; +struct btf_record { + u32 cnt; + u32 field_mask; + struct btf_field fields[]; }; -struct bpf_map_off_arr { +struct btf_field_offs { u32 cnt; u32 field_off[BPF_MAP_OFF_ARR_MAX]; u8 field_sz[BPF_MAP_OFF_ARR_MAX]; @@ -215,7 +221,7 @@ struct bpf_map { u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ - struct bpf_map_value_off *kptr_off_tab; + struct btf_record *record; int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; @@ -227,7 +233,7 @@ struct bpf_map { struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; - struct bpf_map_off_arr *off_arr; + struct btf_field_offs *field_offs; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ @@ -251,6 +257,37 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline u32 btf_field_type_size(enum btf_field_type type) +{ + switch (type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return sizeof(u64); + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static inline u32 btf_field_type_align(enum btf_field_type type) +{ + switch (type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return __alignof__(u64); + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) +{ + if (IS_ERR_OR_NULL(rec)) + return false; + return rec->field_mask & type; +} + static inline bool map_value_has_spin_lock(const struct bpf_map *map) { return map->spin_lock_off >= 0; @@ -261,23 +298,19 @@ static inline bool map_value_has_timer(const struct bpf_map *map) return map->timer_off >= 0; } -static inline bool map_value_has_kptrs(const struct bpf_map *map) -{ - return !IS_ERR_OR_NULL(map->kptr_off_tab); -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); - if (unlikely(map_value_has_kptrs(map))) { - struct bpf_map_value_off *tab = map->kptr_off_tab; + if (!IS_ERR_OR_NULL(map->record)) { + struct btf_field *fields = map->record->fields; + u32 cnt = map->record->cnt; int i; - for (i = 0; i < tab->nr_off; i++) - *(u64 *)(dst + tab->off[i].offset) = 0; + for (i = 0; i < cnt; i++) + memset(dst + fields[i].offset, 0, btf_field_type_size(fields[i].type)); } } @@ -303,7 +336,7 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b u32 curr_off = 0; int i; - if (likely(!map->off_arr)) { + if (likely(!map->field_offs)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); else @@ -311,11 +344,12 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b return; } - for (i = 0; i < map->off_arr->cnt; i++) { - u32 next_off = map->off_arr->field_off[i]; + for (i = 0; i < map->field_offs->cnt; i++) { + u32 next_off = map->field_offs->field_off[i]; + u32 sz = next_off - curr_off; - memcpy(dst + curr_off, src + curr_off, next_off - curr_off); - curr_off += map->off_arr->field_sz[i]; + memcpy(dst + curr_off, src + curr_off, sz); + curr_off += map->field_offs->field_sz[i]; } memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } @@ -335,16 +369,17 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) u32 curr_off = 0; int i; - if (likely(!map->off_arr)) { + if (likely(!map->field_offs)) { memset(dst, 0, map->value_size); return; } - for (i = 0; i < map->off_arr->cnt; i++) { - u32 next_off = map->off_arr->field_off[i]; + for (i = 0; i < map->field_offs->cnt; i++) { + u32 next_off = map->field_offs->field_off[i]; + u32 sz = next_off - curr_off; - memset(dst + curr_off, 0, next_off - curr_off); - curr_off += map->off_arr->field_sz[i]; + memset(dst + curr_off, 0, sz); + curr_off += map->field_offs->field_sz[i]; } memset(dst + curr_off, 0, map->value_size - curr_off); } @@ -1699,11 +1734,13 @@ void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset); -void bpf_map_free_kptr_off_tab(struct bpf_map *map); -struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); -void bpf_map_free_kptrs(struct bpf_map *map, void *map_value); +struct btf_field *btf_record_find(const struct btf_record *rec, + u32 offset, enum btf_field_type type); +void btf_record_free(struct btf_record *rec); +void bpf_map_free_record(struct bpf_map *map); +struct btf_record *btf_record_dup(const struct btf_record *rec); +bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 86aad9b2ce02..9e62717cdc7a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,8 +163,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, - const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit From db559117828d2448fe81ada051c60bcf39f822e9 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:56 +0530 Subject: bpf: Consolidate spin_lock, timer management into btf_record Now that kptr_off_tab has been refactored into btf_record, and can hold more than one specific field type, accomodate bpf_spin_lock and bpf_timer as well. While they don't require any more metadata than offset, having all special fields in one place allows us to share the same code for allocated user defined types and handle both map values and these allocated objects in a similar fashion. As an optimization, we still keep spin_lock_off and timer_off offsets in the btf_record structure, just to avoid having to find the btf_field struct each time their offset is needed. This is mostly needed to manipulate such objects in a map value at runtime. It's ok to hardcode just one offset as more than one field is disallowed. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 53 ++++++++++++++++++++++++++++++++--------------------- include/linux/btf.h | 3 ++- 2 files changed, 34 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5f2a42033a37..aae85019abde 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -166,13 +166,13 @@ struct bpf_map_ops { enum { /* Support at most 8 pointers in a BTF type */ - BTF_FIELDS_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + - 1 + /* for bpf_spin_lock */ - 1, /* for bpf_timer */ + BTF_FIELDS_MAX = 10, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX, }; enum btf_field_type { + BPF_SPIN_LOCK = (1 << 0), + BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, @@ -196,6 +196,8 @@ struct btf_field { struct btf_record { u32 cnt; u32 field_mask; + int spin_lock_off; + int timer_off; struct btf_field fields[]; }; @@ -220,10 +222,8 @@ struct bpf_map { u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; - int spin_lock_off; /* >=0 valid offset, <0 error */ - struct btf_record *record; - int timer_off; /* >=0 valid offset, <0 error */ u32 id; + struct btf_record *record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; @@ -257,9 +257,29 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline const char *btf_field_type_name(enum btf_field_type type) +{ + switch (type) { + case BPF_SPIN_LOCK: + return "bpf_spin_lock"; + case BPF_TIMER: + return "bpf_timer"; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return "kptr"; + default: + WARN_ON_ONCE(1); + return "unknown"; + } +} + static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return sizeof(struct bpf_spin_lock); + case BPF_TIMER: + return sizeof(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); @@ -272,6 +292,10 @@ static inline u32 btf_field_type_size(enum btf_field_type type) static inline u32 btf_field_type_align(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return __alignof__(struct bpf_spin_lock); + case BPF_TIMER: + return __alignof__(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); @@ -288,22 +312,8 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline bool map_value_has_spin_lock(const struct bpf_map *map) -{ - return map->spin_lock_off >= 0; -} - -static inline bool map_value_has_timer(const struct bpf_map *map) -{ - return map->timer_off >= 0; -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { - if (unlikely(map_value_has_spin_lock(map))) - memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); - if (unlikely(map_value_has_timer(map))) - memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); if (!IS_ERR_OR_NULL(map->record)) { struct btf_field *fields = map->record->fields; u32 cnt = map->record->cnt; @@ -1740,6 +1750,7 @@ void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 9e62717cdc7a..282006abd062 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,7 +163,8 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit From f71b2f64177a199d5b1d2047e155d45fd98f564a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:57 +0530 Subject: bpf: Refactor map->off_arr handling Refactor map->off_arr handling into generic functions that can work on their own without hardcoding map specific code. The btf_fields_offs structure is now returned from btf_parse_field_offs, which can be reused later for types in program BTF. All functions like copy_map_value, zero_map_value call generic underlying functions so that they can also be reused later for copying to values allocated in programs which encode specific fields. Later, some helper functions will also require access to this btf_field_offs structure to be able to skip over special fields at runtime. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-9-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 41 ++++++++++++++++++++++++----------------- include/linux/btf.h | 1 + 2 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aae85019abde..798aec816970 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -341,57 +341,64 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ -static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy) +static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, + void *dst, void *src, u32 size, + bool long_memcpy) { u32 curr_off = 0; int i; - if (likely(!map->field_offs)) { + if (likely(!foffs)) { if (long_memcpy) - bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); + bpf_long_memcpy(dst, src, round_up(size, 8)); else - memcpy(dst, src, map->value_size); + memcpy(dst, src, size); return; } - for (i = 0; i < map->field_offs->cnt; i++) { - u32 next_off = map->field_offs->field_off[i]; + for (i = 0; i < foffs->cnt; i++) { + u32 next_off = foffs->field_off[i]; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += map->field_offs->field_sz[i]; + curr_off += foffs->field_sz[i]; } - memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); + memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - __copy_map_value(map, dst, src, false); + bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { - __copy_map_value(map, dst, src, true); + bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true); } -static inline void zero_map_value(struct bpf_map *map, void *dst) +static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size) { u32 curr_off = 0; int i; - if (likely(!map->field_offs)) { - memset(dst, 0, map->value_size); + if (likely(!foffs)) { + memset(dst, 0, size); return; } - for (i = 0; i < map->field_offs->cnt; i++) { - u32 next_off = map->field_offs->field_off[i]; + for (i = 0; i < foffs->cnt; i++) { + u32 next_off = foffs->field_off[i]; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += map->field_offs->field_sz[i]; + curr_off += foffs->field_sz[i]; } - memset(dst + curr_off, 0, map->value_size - curr_off); + memset(dst + curr_off, 0, size - curr_off); +} + +static inline void zero_map_value(struct bpf_map *map, void *dst) +{ + bpf_obj_memzero(map->field_offs, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, diff --git a/include/linux/btf.h b/include/linux/btf.h index 282006abd062..d80345fa566b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -165,6 +165,7 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); +struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit From 2b6c0e152868c9c5939a5c5094d5b2be61cf48e6 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 28 Oct 2022 11:23:32 +0200 Subject: bcma: Use the proper gpio include The is including the legacy header to obtain struct gpio_chip. Instead, include where this struct is defined. It turns out that the brcm80211 brcmsmac depends on this to bring in the symbol gpio_is_valid(). The driver looks up the BCMA parent GPIO driver and checks that this succeeds, but then it goes on to use the deprecated GPIO call gpio_is_valid() to check the consistency of the .base member of the BCMA GPIO struct. The whole check can be dropped because the bcma_gpio is initialized in the declarations: struct gpio_chip *bcma_gpio = &cc_drv->gpio; And this can never be NULL. Cc: Jonas Gorski Acked-by: Arend van Spriel Signed-off-by: Linus Walleij Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221028092332.238728-1-linus.walleij@linaro.org --- include/linux/bcma/bcma_driver_chipcommon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 2d94c30ed439..0cb6638b55e5 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -4,7 +4,7 @@ #include #include -#include +#include /** ChipCommon core registers. **/ #define BCMA_CC_ID 0x0000 -- cgit From 6c6871cdaef96361f6b79a3e45d451a6475df4d6 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 4 Nov 2022 11:01:27 +0100 Subject: spi: Merge spi_controller.{slave,target}_abort() Mixing SPI slave/target handlers and SPI slave/target controllers using legacy and modern naming does not work well: there are now two different callbacks for aborting a slave/target operation, of which only one is populated, while spi_{slave,target}_abort() check and use only one, which may be the unpopulated one. Fix this by merging the slave/target abort callbacks into a single callback using a union, like is already done for the slave/target flags. Fixes: b8d3b056a78dcc94 ("spi: introduce new helpers with using modern naming") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/809c82d54b85dd87ef7ee69fc93016085be85cec.1667555967.git.geert+renesas@glider.be Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 798c30229e07..9a32495fbb1f 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -655,8 +655,10 @@ struct spi_controller { struct spi_message *message); int (*unprepare_message)(struct spi_controller *ctlr, struct spi_message *message); - int (*slave_abort)(struct spi_controller *ctlr); - int (*target_abort)(struct spi_controller *ctlr); + union { + int (*slave_abort)(struct spi_controller *ctlr); + int (*target_abort)(struct spi_controller *ctlr); + }; /* * These hooks are for drivers that use a generic implementation -- cgit From b10b9c342f7571f287fd422be5d5c0beb26ba974 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 10 Oct 2022 12:31:21 -0400 Subject: lsm: make security_socket_getpeersec_stream() sockptr_t safe Commit 4ff09db1b79b ("bpf: net: Change sk_getsockopt() to take the sockptr_t argument") made it possible to call sk_getsockopt() with both user and kernel address space buffers through the use of the sockptr_t type. Unfortunately at the time of conversion the security_socket_getpeersec_stream() LSM hook was written to only accept userspace buffers, and in a desire to avoid having to change the LSM hook the commit author simply passed the sockptr_t's userspace buffer pointer. Since the only sk_getsockopt() callers at the time of conversion which used kernel sockptr_t buffers did not allow SO_PEERSEC, and hence the security_socket_getpeersec_stream() hook, this was acceptable but also very fragile as future changes presented the possibility of silently passing kernel space pointers to the LSM hook. There are several ways to protect against this, including careful code review of future commits, but since relying on code review to catch bugs is a recipe for disaster and the upstream eBPF maintainer is "strongly against defensive programming", this patch updates the LSM hook, and all of the implementations to support sockptr_t and safely handle both user and kernel space buffers. Acked-by: Casey Schaufler Acked-by: John Johansen Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/lsm_hooks.h | 4 ++-- include/linux/security.h | 11 +++++++---- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ec119da1d89b..6abde829b6e5 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -302,7 +302,7 @@ LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how) LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb) LSM_HOOK(int, 0, socket_getpeersec_stream, struct socket *sock, - char __user *optval, int __user *optlen, unsigned len) + sockptr_t optval, sockptr_t optlen, unsigned int len) LSM_HOOK(int, 0, socket_getpeersec_dgram, struct socket *sock, struct sk_buff *skb, u32 *secid) LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 9a7e69aff3d1..2831efebde69 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -949,8 +949,8 @@ * SO_GETPEERSEC. For tcp sockets this can be meaningful if the * socket is associated with an ipsec SA. * @sock is the local socket. - * @optval userspace memory where the security state is to be copied. - * @optlen userspace int where the module should copy the actual length + * @optval memory where the security state is to be copied. + * @optlen memory where the module should copy the actual length * of the security state. * @len as input is the maximum length to copy to userspace provided * by the caller. diff --git a/include/linux/security.h b/include/linux/security.h index ca1b7109c0db..0e419c595cee 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -31,6 +31,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -1411,8 +1412,8 @@ int security_socket_getsockopt(struct socket *sock, int level, int optname); int security_socket_setsockopt(struct socket *sock, int level, int optname); int security_socket_shutdown(struct socket *sock, int how); int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb); -int security_socket_getpeersec_stream(struct socket *sock, char __user *optval, - int __user *optlen, unsigned len); +int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, + sockptr_t optlen, unsigned int len); int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid); int security_sk_alloc(struct sock *sk, int family, gfp_t priority); void security_sk_free(struct sock *sk); @@ -1548,8 +1549,10 @@ static inline int security_sock_rcv_skb(struct sock *sk, return 0; } -static inline int security_socket_getpeersec_stream(struct socket *sock, char __user *optval, - int __user *optlen, unsigned len) +static inline int security_socket_getpeersec_stream(struct socket *sock, + sockptr_t optval, + sockptr_t optlen, + unsigned int len) { return -ENOPROTOOPT; } -- cgit From d08cb25556773c65efb21761b75f92c804ad0975 Mon Sep 17 00:00:00 2001 From: David Yang Date: Fri, 28 Oct 2022 10:01:01 +0800 Subject: net: mv643xx_eth: support MII/GMII/RGMII modes for Kirkwood Support mode switch properly, which is not available before. If SoC has two Ethernet controllers, by setting both of them into MII mode, the first controller enters GMII mode, while the second controller is effectively disabled. This requires configuring (and maybe enabling) the second controller in the device tree, even though it cannot be used. Signed-off-by: David Yang Signed-off-by: David S. Miller --- include/linux/mv643xx_eth.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 3682ae75c7aa..145169be2ed8 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -8,6 +8,7 @@ #include #include +#include #define MV643XX_ETH_SHARED_NAME "mv643xx_eth" #define MV643XX_ETH_NAME "mv643xx_eth_port" @@ -59,6 +60,7 @@ struct mv643xx_eth_platform_data { */ int speed; int duplex; + phy_interface_t interface; /* * How many RX/TX queues to use. -- cgit From 6251d38059ae22304ede4f3748af9f795bdbf4fd Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Wed, 28 Sep 2022 19:28:34 -0500 Subject: ACPI: ARM Performance Monitoring Unit Table (APMT) initial support ARM Performance Monitoring Unit Table describes the properties of PMU support in ARM-based system. The APMT table contains a list of nodes, each represents a PMU in the system that conforms to ARM CoreSight PMU architecture. The properties of each node include information required to access the PMU (e.g. MMIO base address, interrupt number) and also identification. For more detailed information, please refer to the specification below: * APMT: https://developer.arm.com/documentation/den0117/latest * ARM Coresight PMU: https://developer.arm.com/documentation/ihi0091/latest The initial support adds the detection of APMT table and generic infrastructure to create platform devices for ARM CoreSight PMUs. Similar to IORT the root pointer of APMT is preserved during runtime and each PMU platform device is given a pointer to the corresponding APMT node. Signed-off-by: Besar Wicaksono Acked-by: Rafael J. Wysocki Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20220929002834.32664-1-bwicaksono@nvidia.com Signed-off-by: Will Deacon --- include/linux/acpi_apmt.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/acpi_apmt.h (limited to 'include/linux') diff --git a/include/linux/acpi_apmt.h b/include/linux/acpi_apmt.h new file mode 100644 index 000000000000..40bd634d082f --- /dev/null +++ b/include/linux/acpi_apmt.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * ARM CoreSight PMU driver. + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. + * + */ + +#ifndef __ACPI_APMT_H__ +#define __ACPI_APMT_H__ + +#include + +#ifdef CONFIG_ACPI_APMT +void acpi_apmt_init(void); +#else +static inline void acpi_apmt_init(void) { } +#endif /* CONFIG_ACPI_APMT */ + +#endif /* __ACPI_APMT_H__ */ -- cgit From fe40ffdb7656d1f9c42dd402740765ff8b418b17 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 30 Sep 2022 12:18:44 +0100 Subject: arm_pmu: rework ACPI probing The current ACPI PMU probing logic tries to associate PMUs with CPUs when the CPU is first brought online, in order to handle late hotplug, though PMUs are only registered during early boot, and so for late hotplugged CPUs this can only associate the CPU with an existing PMU. We tried to be clever and the have the arm_pmu_acpi_cpu_starting() callback allocate a struct arm_pmu when no matching instance is found, in order to avoid duplication of logic. However, as above this doesn't do anything useful for late hotplugged CPUs, and this requires us to allocate memory in an atomic context, which is especially problematic for PREEMPT_RT, as reported by Valentin and Pierre. This patch reworks the probing to detect PMUs for all online CPUs in the arm_pmu_acpi_probe() function, which is more aligned with how DT probing works. The arm_pmu_acpi_cpu_starting() callback only tries to associate CPUs with an existing arm_pmu instance, avoiding the problem of allocating in atomic context. Note that as we didn't previously register PMUs for late-hotplugged CPUs, this change doesn't result in a loss of existing functionality, though we will now warn when we cannot associate a CPU with a PMU. This change allows us to pull the hotplug callback registration into the arm_pmu_acpi_probe() function, as we no longer need the callbacks to be invoked shortly after probing the boot CPUs, and can register it without invoking the calls. For the moment the arm_pmu_acpi_init() initcall remains to register the SPE PMU, though in future this should probably be moved elsewhere (e.g. the arm64 ACPI init code), since this doesn't need to be tied to the regular CPU PMU code. Signed-off-by: Mark Rutland Reported-by: Valentin Schneider Link: https://lore.kernel.org/r/20210810134127.1394269-2-valentin.schneider@arm.com/ Reported-by: Pierre Gondois Link: https://lore.kernel.org/linux-arm-kernel/20220912155105.1443303-1-pierre.gondois@arm.com/ Cc: Pierre Gondois Cc: Valentin Schneider Cc: Will Deacon Reviewed-and-tested-by: Pierre Gondois Link: https://lore.kernel.org/r/20220930111844.1522365-4-mark.rutland@arm.com Signed-off-by: Will Deacon --- include/linux/perf/arm_pmu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 0356cb6a215d..0c15c5b7f801 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -174,7 +174,6 @@ void kvm_host_pmu_init(struct arm_pmu *pmu); /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); -struct arm_pmu *armpmu_alloc_atomic(void); void armpmu_free(struct arm_pmu *pmu); int armpmu_register(struct arm_pmu *pmu); int armpmu_request_irq(int irq, int cpu); -- cgit From a50ae8c98e5766a4fcb78e76f13cc658b784eac1 Mon Sep 17 00:00:00 2001 From: Dario Binacchi Date: Tue, 18 Oct 2022 19:02:05 +0200 Subject: mtd: nand: drop EXPORT_SYMBOL_GPL for nanddev_erase() This function is only used within this module, so it is no longer necessary to use EXPORT_SYMBOL_GPL(). Signed-off-by: Dario Binacchi Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20221018170205.1733958-1-dario.binacchi@amarulasolutions.com --- include/linux/mtd/nand.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index c3693bb87b4c..b2996dc987ff 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -999,7 +999,6 @@ static inline bool nanddev_io_iter_end(struct nand_device *nand, bool nanddev_isbad(struct nand_device *nand, const struct nand_pos *pos); bool nanddev_isreserved(struct nand_device *nand, const struct nand_pos *pos); -int nanddev_erase(struct nand_device *nand, const struct nand_pos *pos); int nanddev_markbad(struct nand_device *nand, const struct nand_pos *pos); /* ECC related functions */ -- cgit From e1f4ecab19338ec7079830e8700e4869f991fd45 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 4 Nov 2022 17:13:01 +0000 Subject: net: remove explicit phylink_generic_validate() references Virtually all conventional network drivers are now converted to use phylink_generic_validate() - only DSA drivers and fman_memac remain, so lets remove the necessity for network drivers to explicitly set this member, and default to phylink_generic_validate() when unset. This is possible as .validate must currently be set. Any remaining instances that have not been addressed by this patch can be fixed up later. Signed-off-by: Russell King (Oracle) Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/E1or0FZ-001tRa-DI@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 1df3e5e316e8..c492c26202b5 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -207,6 +207,11 @@ struct phylink_mac_ops { * * If the @state->interface mode is not supported, then the @supported * mask must be cleared. + * + * This member is optional; if not set, the generic validator will be + * used making use of @config->mac_capabilities and + * @config->supported_interfaces to determine which link modes are + * supported. */ void validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit From d667c94962c1c81ef587ac91dc5c01a1cfe339c7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 24 Oct 2022 16:14:34 +0800 Subject: mm/percpu: remove unused PERCPU_DYNAMIC_EARLY_SLOTS Since commit 40064aeca35c ("percpu: replace area map allocator with bitmap"), there's no place to use PERCPU_DYNAMIC_EARLY_SLOTS. So clean it up. Signed-off-by: Baoquan He Signed-off-by: Dennis Zhou --- include/linux/percpu.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index f1ec5ad1351c..70bc17db00a6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -37,11 +37,10 @@ /* * Percpu allocator can serve percpu allocations before slab is * initialized which allows slab to depend on the percpu allocator. - * The following two parameters decide how much resource to - * preallocate for this. Keep PERCPU_DYNAMIC_RESERVE equal to or - * larger than PERCPU_DYNAMIC_EARLY_SIZE. + * The following parameter decide how much resource to preallocate + * for this. Keep PERCPU_DYNAMIC_RESERVE equal to or larger than + * PERCPU_DYNAMIC_EARLY_SIZE. */ -#define PERCPU_DYNAMIC_EARLY_SLOTS 128 #define PERCPU_DYNAMIC_EARLY_SIZE (12 << 10) /* -- cgit From 9a0f830f80265bd1ef816e1541ac24bee80e9a3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:01:25 -0700 Subject: ethtool: linkstate: add a statistic for PHY down events The previous attempt to augment carrier_down (see Link) was not met with much enthusiasm so let's do the simple thing of exposing what some devices already maintain. Add a common ethtool statistic for link going down. Currently users have to maintain per-driver mapping to extract the right stat from the vendor-specific ethtool -S stats. carrier_down does not fit the bill because it counts a lot of software related false positives. Add the statistic to the extended link state API to steer vendors towards implementing all of it. Implement for bnxt and all Linux-controlled PHYs. mlx5 and (possibly) enic also have a counter for this but I leave the implementation to their maintainers. Link: https://lore.kernel.org/r/20220520004500.2250674-1-kuba@kernel.org Reviewed-by: Florian Fainelli Reviewed-by: Michael Chan Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20221104190125.684910-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 17 +++++++++++++++++ include/linux/phy.h | 3 +++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 99dc7bfbcd3c..5c51c7fda32a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -125,6 +125,20 @@ struct ethtool_link_ext_state_info { }; }; +struct ethtool_link_ext_stats { + /* Custom Linux statistic for PHY level link down events. + * In a simpler world it should be equal to netdev->carrier_down_count + * unfortunately netdev also counts local reconfigurations which don't + * actually take the physical link down, not to mention NC-SI which, + * if present, keeps the link up regardless of host state. + * This statistic counts when PHY _actually_ went down, or lost link. + * + * Note that we need u64 for ethtool_stats_init() and comparisons + * to ETHTOOL_STAT_NOT_SET, but only u32 is exposed to the user. + */ + u64 link_down_events; +}; + /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table @@ -481,6 +495,7 @@ struct ethtool_module_power_mode_params { * do not attach ext_substate attribute to netlink message). If link_ext_state * and link_ext_substate are unknown, return -ENODATA. If not implemented, * link_ext_state and link_ext_substate will not be sent to userspace. + * @get_link_ext_stats: Read extra link-related counters. * @get_eeprom_len: Read range of EEPROM addresses for validation of * @get_eeprom and @set_eeprom requests. * Returns 0 if device does not support EEPROM access. @@ -652,6 +667,8 @@ struct ethtool_ops { u32 (*get_link)(struct net_device *); int (*get_link_ext_state)(struct net_device *, struct ethtool_link_ext_state_info *); + void (*get_link_ext_stats)(struct net_device *dev, + struct ethtool_link_ext_stats *stats); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); diff --git a/include/linux/phy.h b/include/linux/phy.h index ddf66198f751..9a3752c0c444 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -600,6 +600,7 @@ struct macsec_ops; * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine + * @link_down_events: Number of times link was lost * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * @@ -723,6 +724,8 @@ struct phy_device { int pma_extable; + unsigned int link_down_events; + void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); -- cgit From c3d96f690a790074b508fe183a41e36a00cd7ddd Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 3 Oct 2022 07:34:21 +0100 Subject: net, proc: Provide PROC_FS=n fallback for proc_create_net_single_write() Provide a CONFIG_PROC_FS=n fallback for proc_create_net_single_write(). Also provide a fallback for proc_create_net_data_write(). Fixes: 564def71765c ("proc: Add a way to make network proc files writable") Reported-by: kernel test robot Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- include/linux/proc_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 81d6e4ec2294..0260f5ea98fe 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -208,8 +208,10 @@ static inline void proc_remove(struct proc_dir_entry *de) {} static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { return 0; } #define proc_create_net_data(name, mode, parent, ops, state_size, data) ({NULL;}) +#define proc_create_net_data_write(name, mode, parent, ops, write, state_size, data) ({NULL;}) #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;}) #define proc_create_net_single(name, mode, parent, show, data) ({NULL;}) +#define proc_create_net_single_write(name, mode, parent, show, write, data) ({NULL;}) static inline struct pid *tgid_pidfd_to_pid(const struct file *file) { -- cgit From 42fb06b391ace2aec5cdb1ebb8ff668f0a34332f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Oct 2022 08:49:29 +0100 Subject: net: Change the udp encap_err_rcv to allow use of {ip,ipv6}_icmp_error() Change the udp encap_err_rcv signature to match ip_icmp_error() and ipv6_icmp_error() so that those can be used from the called function and export them. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- include/linux/udp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 5cdba00a904a..dea57aa37df6 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -70,7 +70,8 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); - void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); + void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); -- cgit From c14f7ccc9f5dcf9d06ddeec706f85405b2c80600 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 14 Jul 2022 20:41:30 +0200 Subject: PCI: Assign PCI domain IDs by ida_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace assignment of PCI domain IDs from atomic_inc_return() to ida_alloc(). Use two IDAs, one for static domain allocations (those which are defined in device tree) and second for dynamic allocations (all other). During removal of root bus / host bridge, also release the domain ID. The released ID can be reused again, for example when dynamically loading and unloading native PCI host bridge drivers. This change also allows to mix static device tree assignment and dynamic by kernel as all static allocations are reserved in dynamic pool. [bhelgaas: set "err" if "bus->domain_nr < 0"] Link: https://lore.kernel.org/r/20220714184130.5436-1-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 2bda4a4e47e8..28af4414f789 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1726,6 +1726,7 @@ static inline int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) { return 0; } #endif int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent); +void pci_bus_release_domain_nr(struct pci_bus *bus, struct device *parent); #endif /* Some architectures require additional setup to direct VGA traffic */ -- cgit From e6c7e6216dc628ab7c627a6bcda7349715bbb67e Mon Sep 17 00:00:00 2001 From: Xinlei Lee Date: Tue, 8 Nov 2022 19:23:27 +0100 Subject: soc: mediatek: Add all settings to mtk_mmsys_ddp_dpi_fmt_config func MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The difference between MT8186 and other ICs is that when modifying the output format, we need to modify the mmsys_base+0x400 register to take effect. So when setting the dpi output format, we need to call mtk_mmsys_ddp_dpi_fmt_config to set it to MT8186 synchronously. Commit a071e52f75d1 ("soc: mediatek: Add mmsys func to adapt to dpi output for MT8186") lacked some of the possible output formats and also had a wrong bitmask. Add the missing output formats and fix the bitmask. While at it, also update mtk_mmsys_ddp_dpi_fmt_config() to use generic formats, so that it is slightly easier to extend for other platforms. Fixes: a071e52f75d1 ("soc: mediatek: Add mmsys func to adapt to dpi output for MT8186") Signed-off-by: Xinlei Lee Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: CK Hu Reviewed-by: Nícolas F. R. A. Prado Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-mmsys.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h index d2b02bb43768..b85f66db33e1 100644 --- a/include/linux/soc/mediatek/mtk-mmsys.h +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -9,6 +9,13 @@ enum mtk_ddp_comp_id; struct device; +enum mtk_dpi_out_format_con { + MTK_DPI_RGB888_SDR_CON, + MTK_DPI_RGB888_DDR_CON, + MTK_DPI_RGB565_SDR_CON, + MTK_DPI_RGB565_DDR_CON +}; + enum mtk_ddp_comp_id { DDP_COMPONENT_AAL0, DDP_COMPONENT_AAL1, -- cgit From 57a196a58421a4b0c45949ae7309f21829aaa77f Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sun, 18 Sep 2022 19:13:48 -0700 Subject: hugetlb: simplify hugetlb handling in follow_page_mask During discussions of this series [1], it was suggested that hugetlb handling code in follow_page_mask could be simplified. At the beginning of follow_page_mask, there currently is a call to follow_huge_addr which 'may' handle hugetlb pages. ia64 is the only architecture which provides a follow_huge_addr routine that does not return error. Instead, at each level of the page table a check is made for a hugetlb entry. If a hugetlb entry is found, a call to a routine associated with that entry is made. Currently, there are two checks for hugetlb entries at each page table level. The first check is of the form: if (p?d_huge()) page = follow_huge_p?d(); the second check is of the form: if (is_hugepd()) page = follow_huge_pd(). We can replace these checks, as well as the special handling routines such as follow_huge_p?d() and follow_huge_pd() with a single routine to handle hugetlb vmas. A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the beginning of follow_page_mask. hugetlb_follow_page_mask will use the existing routine huge_pte_offset to walk page tables looking for hugetlb entries. huge_pte_offset can be overwritten by architectures, and already handles special cases such as hugepd entries. [1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/ [mike.kravetz@oracle.com: remove vma (pmd sharing) per Peter] Link: https://lkml.kernel.org/r/20221028181108.119432-1-mike.kravetz@oracle.com [mike.kravetz@oracle.com: remove left over hugetlb_vma_unlock_read()] Link: https://lkml.kernel.org/r/20221030225825.40872-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220919021348.22151-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Aneesh Kumar K.V Cc: Christophe Leroy Cc: Michael Ellerman Cc: Muchun Song Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 50 ++++++++----------------------------------------- 1 file changed, 8 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8b4f93e84868..4a76c0fc6bbf 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -149,6 +149,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, unsigned long *, long, unsigned int, @@ -209,17 +211,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write); -struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, - int flags, int pdshift); -struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, - int flags); -struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags); -struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, - pgd_t *pgd, int flags); void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); @@ -272,6 +263,12 @@ static inline void adjust_range_if_pmd_sharing_possible( { } +static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ +} + static inline long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, @@ -282,12 +279,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm, return 0; } -static inline struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, @@ -320,31 +311,6 @@ static inline void hugetlb_show_meminfo_node(int nid) { } -static inline struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, int flags, - int pdshift) -{ - return NULL; -} - -static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, - unsigned long address, int flags) -{ - return NULL; -} - -static inline struct page *follow_huge_pud(struct mm_struct *mm, - unsigned long address, pud_t *pud, int flags) -{ - return NULL; -} - -static inline struct page *follow_huge_pgd(struct mm_struct *mm, - unsigned long address, pgd_t *pgd, int flags) -{ - return NULL; -} - static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { -- cgit From 0538a82c39e94d49fa6985c6a0101ca819be11ee Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Oct 2022 15:31:13 -0400 Subject: mm: vmscan: make rotations a secondary factor in balancing anon vs file We noticed a 2% webserver throughput regression after upgrading from 5.6. This could be tracked down to a shift in the anon/file reclaim balance (confirmed with swappiness) that resulted in worse reclaim efficiency and thus more kswapd activity for the same outcome. The change that exposed the problem is aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU"). By qualifying swapins based on their refault distance, it lowered the cost of anon reclaim in this workload, in turn causing (much) more anon scanning than before. Scanning the anon list is more expensive due to the higher ratio of mmapped pages that may rotate during reclaim, and so the result was an increase in %sys time. Right now, rotations aren't considered a cost when balancing scan pressure between LRUs. We can end up with very few file refaults putting all the scan pressure on hot anon pages that are rotated en masse, don't get reclaimed, and never push back on the file LRU again. We still only reclaim file cache in that case, but we burn a lot CPU rotating anon pages. It's "fair" from an LRU age POV, but doesn't reflect the real cost it imposes on the system. Consider rotations as a secondary factor in balancing the LRUs. This doesn't attempt to make a precise comparison between IO cost and CPU cost, it just says: if reloads are about comparable between the lists, or rotations are overwhelmingly different, adjust for CPU work. This fixed the regression on our webservers. It has since been deployed to the entire Meta fleet and hasn't caused any problems. Link: https://lkml.kernel.org/r/20221013193113.726425-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index a18cf4b7c724..369d7799205d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -384,8 +384,9 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); -void lru_note_cost_folio(struct folio *); +void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated); +void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void lru_cache_add(struct page *); -- cgit From d03c376d9066532551dc56837c7c5490e4fcbbfe Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:03 -0500 Subject: mm/hugetlb: add folio support to hugetlb specific flag macros Patch series "begin converting hugetlb code to folios", v4. This patch series starts the conversion of the hugetlb code to operate on struct folios rather than struct pages. This removes the ambiguitiy of whether functions are operating on head pages, tail pages of compound pages, or base pages. This series passes the linux test project hugetlb test cases. Patch 1 adds hugeltb specific page macros that can operate on folios. Patch 2 adds the private field of the first tail page to struct page. For 32-bit, _private_1 alinging with page[1].private was confirmed by using pahole. Patch 3 introduces hugetlb subpool helper functions which operate on struct folios. These patches were tested using the hugepage-mmap.c selftest along with the migratepages command. Patch 4 converts hugetlb_delete_from_page_cache() to use folios. Patch 5 adds a folio_hstate() function to get hstate information from a folio and adds a user of folio_hstate(). Bpftrace was used to track time spent in the free_huge_pages function during the ltp test cases as it is a caller of the hugetlb subpool functions. From the histogram, the performance is similar before and after the patch series. Time spent in 'free_huge_page' 6.0.0-rc2.master.20220823 @nsecs: [256, 512) 14770 |@@@@@@@@@@@@@@@@@@@@@@@@@@@ |@@@@@@@@@@@@@@@@@@@@@@@@@ | [512, 1K) 155 | | [1K, 2K) 169 | | [2K, 4K) 50 | | [4K, 8K) 14 | | [8K, 16K) 3 | | [16K, 32K) 3 | | 6.0.0-rc2.master.20220823 + patch series @nsecs: [256, 512) 13678 |@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |@@@@@@@@@@@@@@@@@@@@@@@@@ | [512, 1K) 142 | | [1K, 2K) 199 | | [2K, 4K) 44 | | [4K, 8K) 13 | | [8K, 16K) 4 | | [16K, 32K) 1 | | This patch (of 5): Allow the macros which test, set, and clear hugetlb specific page flags to take a hugetlb folio as an input. The macrros are generated as folio_{test, set, clear}_hugetlb_{restore_reserve, migratable, temporary, freed, vmemmap_optimized, raw_hwp_unreliable}. Link: https://lkml.kernel.org/r/20220922154207.1575343-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20220922154207.1575343-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4a76c0fc6bbf..3ff5d2dd3ca3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -589,26 +589,50 @@ enum hugetlb_page_flags { */ #ifdef CONFIG_HUGETLB_PAGE #define TESTHPAGEFLAG(uname, flname) \ +static __always_inline \ +bool folio_test_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + return test_bit(HPG_##flname, private); \ + } \ static inline int HPage##uname(struct page *page) \ { return test_bit(HPG_##flname, &(page->private)); } #define SETHPAGEFLAG(uname, flname) \ +static __always_inline \ +void folio_set_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + set_bit(HPG_##flname, private); \ + } \ static inline void SetHPage##uname(struct page *page) \ { set_bit(HPG_##flname, &(page->private)); } #define CLEARHPAGEFLAG(uname, flname) \ +static __always_inline \ +void folio_clear_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + clear_bit(HPG_##flname, private); \ + } \ static inline void ClearHPage##uname(struct page *page) \ { clear_bit(HPG_##flname, &(page->private)); } #else #define TESTHPAGEFLAG(uname, flname) \ +static inline bool \ +folio_test_hugetlb_##flname(struct folio *folio) \ + { return 0; } \ static inline int HPage##uname(struct page *page) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ +static inline void \ +folio_set_hugetlb_##flname(struct folio *folio) \ + { } \ static inline void SetHPage##uname(struct page *page) \ { } #define CLEARHPAGEFLAG(uname, flname) \ +static inline void \ +folio_clear_hugetlb_##flname(struct folio *folio) \ + { } \ static inline void ClearHPage##uname(struct page *page) \ { } #endif -- cgit From d340625f4849ab5dbfebbc7d84709fbfcd39e52f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:04 -0500 Subject: mm: add private field of first tail to struct page and struct folio Allow struct folio to store hugetlb metadata that is contained in the private field of the first tail page. On 32-bit, _private_1 aligns with page[1].private. Link: https://lkml.kernel.org/r/20220922154207.1575343-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 500e536796ca..2d5b1575ffe0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -144,6 +144,7 @@ struct page { atomic_t compound_pincount; #ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ + unsigned long _private_1; #endif }; struct { /* Second tail page of compound page */ @@ -264,6 +265,7 @@ struct page { * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). + * @_private_1: Do not use directly, call folio_get_private_1(). * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -311,6 +313,7 @@ struct folio { #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; #endif + unsigned long _private_1; }; #define FOLIO_MATCH(pg, fl) \ @@ -338,6 +341,7 @@ FOLIO_MATCH(compound_mapcount, _total_mapcount); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); +FOLIO_MATCH(_private_1, _private_1); #endif #undef FOLIO_MATCH @@ -383,6 +387,16 @@ static inline void *folio_get_private(struct folio *folio) return folio->private; } +static inline void folio_set_private_1(struct folio *folio, unsigned long private) +{ + folio->_private_1 = private; +} + +static inline unsigned long folio_get_private_1(struct folio *folio) +{ + return folio->_private_1; +} + struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) -- cgit From 149562f7509404c382c32c3fa8a6ba356135e5cf Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:05 -0500 Subject: mm/hugetlb: add hugetlb_folio_subpool() helpers Allow hugetlbfs_migrate_folio to check and read subpool information by passing in a folio. Link: https://lkml.kernel.org/r/20220922154207.1575343-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3ff5d2dd3ca3..496d02bdb997 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -718,18 +718,29 @@ extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) +static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) +{ + return (void *)folio_get_private_1(folio); +} + /* * hugetlb page subpool pointer located in hpage[1].private */ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { - return (void *)page_private(hpage + SUBPAGE_INDEX_SUBPOOL); + return hugetlb_folio_subpool(page_folio(hpage)); +} + +static inline void hugetlb_set_folio_subpool(struct folio *folio, + struct hugepage_subpool *subpool) +{ + folio_set_private_1(folio, (unsigned long)subpool); } static inline void hugetlb_set_page_subpool(struct page *hpage, struct hugepage_subpool *subpool) { - set_page_private(hpage + SUBPAGE_INDEX_SUBPOOL, (unsigned long)subpool); + hugetlb_set_folio_subpool(page_folio(hpage), subpool); } static inline struct hstate *hstate_file(struct file *f) -- cgit From ece62684dcfb714b7d8452056b4a33d426b16457 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:06 -0500 Subject: hugetlbfs: convert hugetlb_delete_from_page_cache() to use folios Remove the last caller of delete_from_page_cache() by converting the code to its folio equivalent. Link: https://lkml.kernel.org/r/20220922154207.1575343-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bbccb4044222..060ee98474ef 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1102,7 +1102,6 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); -void delete_from_page_cache(struct page *page); void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, -- cgit From e51da3a9b6c2f67879880259a25c51dbda01c462 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:07 -0500 Subject: mm/hugetlb: add folio_hstate() Helper function to retrieve hstate information from a hugetlb folio. Link: https://lkml.kernel.org/r/20220922154207.1575343-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reported-by: kernel test robot Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 496d02bdb997..20a0d5a08395 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -824,10 +824,15 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, } #endif +static inline struct hstate *folio_hstate(struct folio *folio) +{ + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); + return size_to_hstate(folio_size(folio)); +} + static inline struct hstate *page_hstate(struct page *page) { - VM_BUG_ON_PAGE(!PageHuge(page), page); - return size_to_hstate(page_size(page)); + return folio_hstate(page_folio(page)); } static inline unsigned hstate_index_to_shift(unsigned index) @@ -1036,6 +1041,11 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma) return NULL; } +static inline struct hstate *folio_hstate(struct folio *folio) +{ + return NULL; +} + static inline struct hstate *page_hstate(struct page *page) { return NULL; -- cgit From 3e0ee843427a573e3e1187a5331e4b7fb00a76f3 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 7 Oct 2022 13:37:41 +0200 Subject: mm: fix typo in struct vm_operations_struct comments There is no eprotect(), so I assume this is about mprotect(). Link: https://lkml.kernel.org/r/2385684.8vm7BOzihM@mobilepool36.emlix.com Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bbcccbc5565..f6d2d2d9e284 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -549,7 +549,7 @@ struct vm_operations_struct { /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not - * be modified. Returns 0 if eprotect() can proceed. + * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); -- cgit From eafd296e0cc0cc03b4ae01c2b3b07273514d757c Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:46 +0800 Subject: memory: remove unused register_hotmemory_notifier() Remove unused register_hotmemory_notifier(). Link: https://lkml.kernel.org/r/20220923033347.3935160-8-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- include/linux/memory.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index aa619464a1df..98d2a2ebcc10 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -19,7 +19,6 @@ #include #include #include -#include #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) @@ -136,9 +135,6 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) { return 0; } -/* These aren't inline functions due to a GCC bug. */ -#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) -#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) #else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); @@ -166,8 +162,6 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, { .notifier_call = fn, .priority = pri };\ register_memory_notifier(&fn##_mem_nb); \ }) -#define register_hotmemory_notifier(nb) register_memory_notifier(nb) -#define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) #ifdef CONFIG_NUMA void memory_block_add_nid(struct memory_block *mem, int nid, -- cgit From 1eeaa4fd39b0b1b3e986f8eab6978e69b01e3c5e Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:47 +0800 Subject: memory: move hotplug memory notifier priority to same file for easy sorting The priority of hotplug memory callback is defined in a different file. And there are some callers using numbers directly. Collect them together into include/linux/memory.h for easy reading. This allows us to sort their priorities more intuitively without additional comments. Link: https://lkml.kernel.org/r/20220923033347.3935160-9-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Christoph Lameter Cc: David Hildenbrand Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 1 - include/linux/memory.h | 9 +++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 965009aa01d7..fc9647b1b4f9 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -18,7 +18,6 @@ * the same memory tier. */ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) -#define MEMTIER_HOTPLUG_PRIO 100 struct memory_tier; struct memory_dev_type { diff --git a/include/linux/memory.h b/include/linux/memory.h index 98d2a2ebcc10..463662ef7614 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -112,8 +112,13 @@ struct mem_section; * Priorities for the hotplug memory callback routines (stored in decreasing * order in the callback chain) */ -#define SLAB_CALLBACK_PRI 1 -#define IPC_CALLBACK_PRI 10 +#define DEFAULT_CALLBACK_PRI 0 +#define SLAB_CALLBACK_PRI 1 +#define HMAT_CALLBACK_PRI 2 +#define MM_COMPUTE_BATCH_PRI 10 +#define CPUSET_CALLBACK_PRI 10 +#define MEMTIER_HOTPLUG_PRI 100 +#define KSM_CALLBACK_PRI 100 #ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) -- cgit From c5255b421fd04ba6a405809b7216a3b6ebd5493a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 19 Oct 2022 19:33:32 +0100 Subject: mm: remove FGP_HEAD This is no longer used; all callers have been converted to use folios instead. Somehow this manages to save 11 bytes of text. Link: https://lkml.kernel.org/r/20221019183332.2802139-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 060ee98474ef..b33ab86d5dca 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -504,9 +504,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 -#define FGP_HEAD 0x00000080 -#define FGP_ENTRY 0x00000100 -#define FGP_STABLE 0x00000200 +#define FGP_ENTRY 0x00000080 +#define FGP_STABLE 0x00000100 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp); -- cgit From 6e2be1f2ebcea42ed6044432f72f32434e60b34d Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:52 +0200 Subject: compiler-gcc: be consistent with underscores use for `no_sanitize` Patch series "compiler-gcc: be consistent with underscores use for `no_sanitize`". This patch (of 5): Other macros that define shorthands for attributes in e.g. `compiler_attributes.h` and elsewhere use underscores. Link: https://lkml.kernel.org/r/20221021115956.9947-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Cc: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index f55a37efdb97..b9530d3515ac 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -83,25 +83,25 @@ #endif #if __has_attribute(__no_sanitize_address__) -#define __no_sanitize_address __attribute__((no_sanitize_address)) +#define __no_sanitize_address __attribute__((__no_sanitize_address__)) #else #define __no_sanitize_address #endif #if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) -#define __no_sanitize_thread __attribute__((no_sanitize_thread)) +#define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) #else #define __no_sanitize_thread #endif #if __has_attribute(__no_sanitize_undefined__) -#define __no_sanitize_undefined __attribute__((no_sanitize_undefined)) +#define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__)) #else #define __no_sanitize_undefined #endif #if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__) -#define __no_sanitize_coverage __attribute__((no_sanitize_coverage)) +#define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__)) #else #define __no_sanitize_coverage #endif -- cgit From ae37a9a2c2d0960d643d782b426ea1aa9c05727a Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:53 +0200 Subject: compiler-gcc: remove attribute support check for `__no_sanitize_address__` The attribute was added in GCC 4.8, while the minimum GCC version supported by the kernel is GCC 5.1. Therefore, remove the check. Link: https://godbolt.org/z/84v56vcn8 Link: https://lkml.kernel.org/r/20221021115956.9947-2-ojeda@kernel.org Signed-off-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Marco Elver Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index b9530d3515ac..bfce7f4d0978 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -82,11 +82,7 @@ #define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif -#if __has_attribute(__no_sanitize_address__) #define __no_sanitize_address __attribute__((__no_sanitize_address__)) -#else -#define __no_sanitize_address -#endif #if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) #define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) -- cgit From 095ac0763ac507dd4e1a71ad9784f49f51498483 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:54 +0200 Subject: compiler-gcc: remove attribute support check for `__no_sanitize_thread__` The attribute was added in GCC 5.1, which matches the minimum GCC version supported by the kernel. Therefore, remove the check. Link: https://godbolt.org/z/vbxKejxbx Link: https://lkml.kernel.org/r/20221021115956.9947-3-ojeda@kernel.org Signed-off-by: Miguel Ojeda Acked-by: Marco Elver Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index bfce7f4d0978..ba207deb77ca 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -84,7 +84,7 @@ #define __no_sanitize_address __attribute__((__no_sanitize_address__)) -#if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) +#if defined(__SANITIZE_THREAD__) #define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) #else #define __no_sanitize_thread -- cgit From 689540cbda7f69594ae5e13fef4c8239519d8b66 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:55 +0200 Subject: compiler-gcc: remove attribute support check for `__no_sanitize_undefined__` The attribute was added in GCC 4.9, while the minimum GCC version supported by the kernel is GCC 5.1. Therefore, remove the check. Link: https://godbolt.org/z/GrMeo6fYr Link: https://lkml.kernel.org/r/20221021115956.9947-4-ojeda@kernel.org Signed-off-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Marco Elver Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index ba207deb77ca..7f2c2bb73815 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -90,11 +90,7 @@ #define __no_sanitize_thread #endif -#if __has_attribute(__no_sanitize_undefined__) #define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__)) -#else -#define __no_sanitize_undefined -#endif #if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__) #define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__)) -- cgit From f39556bc2530c83a22bc11b73c7a46df9a340685 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:56 +0200 Subject: compiler-gcc: document minimum version for `__no_sanitize_coverage__` The attribute was added in GCC 12.1. This will simplify future cleanups, and is closer to what we do in `compiler_attributes.h`. Link: https://godbolt.org/z/MGbT76j6G Link: https://lkml.kernel.org/r/20221021115956.9947-5-ojeda@kernel.org Signed-off-by: Miguel Ojeda Acked-by: Marco Elver Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7f2c2bb73815..7af9e34ec261 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -92,6 +92,9 @@ #define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__)) +/* + * Only supported since gcc >= 12 + */ #if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__) #define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__)) #else -- cgit From e591ef7d96d6ea249916f351dc26a636e565c635 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:09 +0900 Subject: mm,hwpoison,hugetlb,memory_hotplug: hotremove memory section with hwpoisoned hugepage Patch series "mm, hwpoison: improve handling workload related to hugetlb and memory_hotplug", v7. This patchset tries to solve the issue among memory_hotplug, hugetlb and hwpoison. In this patchset, memory hotplug handles hwpoison pages like below: - hwpoison pages should not prevent memory hotremove, - memory block with hwpoison pages should not be onlined. This patch (of 4): HWPoisoned page is not supposed to be accessed once marked, but currently such accesses can happen during memory hotremove because do_migrate_range() can be called before dissolve_free_huge_pages() is called. Clear HPageMigratable for hwpoisoned hugepages to prevent them from being migrated. This should be done in hugetlb_lock to avoid race against isolate_hugetlb(). get_hwpoison_huge_page() needs to have a flag to show it's called from unpoison to take refcount of hwpoisoned hugepages, so add it. [naoya.horiguchi@linux.dev: remove TestClearHPageMigratable and reduce to test and clear separately] Link: https://lkml.kernel.org/r/20221025053559.GA2104800@ik1-406-35019.vs.sakura.ne.jp Link: https://lkml.kernel.org/r/20221024062012.1520887-1-naoya.horiguchi@linux.dev Link: https://lkml.kernel.org/r/20221024062012.1520887-2-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: Miaohe Lin Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 ++++++---- include/linux/mm.h | 6 ++++-- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 20a0d5a08395..65ea34022aa2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -183,8 +183,9 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); int isolate_hugetlb(struct page *page, struct list_head *list); -int get_hwpoison_huge_page(struct page *page, bool *hugetlb); -int get_huge_page_for_hwpoison(unsigned long pfn, int flags); +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); +int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared); void putback_active_hugepage(struct page *page); void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); @@ -391,12 +392,13 @@ static inline int isolate_hugetlb(struct page *page, struct list_head *list) return -EBUSY; } -static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { return 0; } -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index f6d2d2d9e284..e2ac6fff03a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3277,9 +3277,11 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); +extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared); #else -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { return 0; } -- cgit From d027122d8363e58cd8bc2fa6a16917f7f69b85bb Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:10 +0900 Subject: mm/hwpoison: move definitions of num_poisoned_pages_* to memory-failure.c These interfaces will be used by drivers/base/memory.c by later patch, so as a preparatory work move them to more common header file visible to the file. Link: https://lkml.kernel.org/r/20221024062012.1520887-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ include/linux/swapops.h | 24 ++---------------------- 2 files changed, 7 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e2ac6fff03a8..c667a6c4b657 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,12 +3279,17 @@ extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); +extern void num_poisoned_pages_inc(void); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } + +static inline void num_poisoned_pages_inc(void) +{ +} #endif #ifndef arch_memory_failure diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 86b95ccb81bb..3ba9bf56899d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -581,8 +581,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd) #ifdef CONFIG_MEMORY_FAILURE -extern atomic_long_t num_poisoned_pages __read_mostly; - /* * Support for hardware poisoned pages */ @@ -597,17 +595,7 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } -static inline void num_poisoned_pages_inc(void) -{ - atomic_long_inc(&num_poisoned_pages); -} - -static inline void num_poisoned_pages_sub(long i) -{ - atomic_long_sub(i, &num_poisoned_pages); -} - -#else /* CONFIG_MEMORY_FAILURE */ +#else static inline swp_entry_t make_hwpoison_entry(struct page *page) { @@ -618,15 +606,7 @@ static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } - -static inline void num_poisoned_pages_inc(void) -{ -} - -static inline void num_poisoned_pages_sub(long i) -{ -} -#endif /* CONFIG_MEMORY_FAILURE */ +#endif static inline int non_swap_entry(swp_entry_t entry) { -- cgit From a46c9304b4bbf1b164154976cbb7e648980c7b5b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:11 +0900 Subject: mm/hwpoison: pass pfn to num_poisoned_pages_*() No functional change. Link: https://lkml.kernel.org/r/20221024062012.1520887-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c667a6c4b657..78ae2ee09a24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,7 +3279,7 @@ extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -extern void num_poisoned_pages_inc(void); +extern void num_poisoned_pages_inc(unsigned long pfn); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) @@ -3287,7 +3287,7 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void num_poisoned_pages_inc(void) +static inline void num_poisoned_pages_inc(unsigned long pfn) { } #endif -- cgit From 5033091de814ab4b5623faed2755f3064e19e2d2 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:12 +0900 Subject: mm/hwpoison: introduce per-memory_block hwpoison counter Currently PageHWPoison flag does not behave well when experiencing memory hotremove/hotplug. Any data field in struct page is unreliable when the associated memory is offlined, and the current mechanism can't tell whether a memory block is onlined because a new memory devices is installed or because previous failed offline operations are undone. Especially if there's a hwpoisoned memory, it's unclear what the best option is. So introduce a new mechanism to make struct memory_block remember that a memory block has hwpoisoned memory inside it. And make any online event fail if the onlining memory block contains hwpoison. struct memory_block is freed and reallocated over ACPI-based hotremove/hotplug, but not over sysfs-based hotremove/hotplug. So the new counter can distinguish these cases. Link: https://lkml.kernel.org/r/20221024062012.1520887-5-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: kernel test robot Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/memory.h | 3 +++ include/linux/mm.h | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 463662ef7614..31343566c221 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -84,6 +84,9 @@ struct memory_block { unsigned long nr_vmemmap_pages; struct memory_group *group; /* group (if any) for this block */ struct list_head group_next; /* next block inside memory group */ +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) + atomic_long_t nr_hwpoison; +#endif }; int arch_get_memory_phys_device(unsigned long start_pfn); diff --git a/include/linux/mm.h b/include/linux/mm.h index 78ae2ee09a24..429ff89bfe06 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,7 +3279,8 @@ extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -extern void num_poisoned_pages_inc(unsigned long pfn); +void num_poisoned_pages_inc(unsigned long pfn); +void num_poisoned_pages_sub(unsigned long pfn, long i); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) @@ -3290,6 +3291,23 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, static inline void num_poisoned_pages_inc(unsigned long pfn) { } + +static inline void num_poisoned_pages_sub(unsigned long pfn, long i) +{ +} +#endif + +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +extern void memblk_nr_poison_inc(unsigned long pfn); +extern void memblk_nr_poison_sub(unsigned long pfn, long i); +#else +static inline void memblk_nr_poison_inc(unsigned long pfn) +{ +} + +static inline void memblk_nr_poison_sub(unsigned long pfn, long i) +{ +} #endif #ifndef arch_memory_failure -- cgit From ea0ffd0c08d0fef1f6e93eb07badbeeabf6b43d6 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 24 Oct 2022 00:25:33 +0800 Subject: swap: add a limit for readahead page-cluster value Currenty there is no upper limit for /proc/sys/vm/page-cluster, and it's a bit shift value, so it could result in overflow of the 32-bit integer. Add a reasonable upper limit for it, read-in at most 2**31 pages, which is a large enough value for readahead. Link: https://lkml.kernel.org/r/20221023162533.81561-1-ryncsn@gmail.com Signed-off-by: Kairui Song Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 429ff89bfe06..255931ebf2dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -74,6 +74,7 @@ static inline void totalram_pages_add(long count) extern void * high_memory; extern int page_cluster; +extern const int page_cluster_max; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; -- cgit From 732ea9db9d8a6a0444d18ed810cccb2428d8766b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 11 Oct 2022 17:10:39 +0200 Subject: efi: libstub: Move screen_info handling to common code Currently, arm64, RISC-V and LoongArch rely on the fact that struct screen_info can be accessed directly, due to the fact that the EFI stub and the core kernel are part of the same image. This will change after a future patch, so let's ensure that the screen_info handling is able to deal with this, by adopting the arm32 approach of passing it as a configuration table. While at it, switch to ACPI reclaim memory to hold the screen_info data, which is more appropriate for this kind of allocation. Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 929d559ad41d..dfa6cc8bbef9 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -403,7 +403,7 @@ void efi_native_runtime_setup(void); * structure that was populated by the stub based on the GOP protocol instance * associated with ConOut */ -#define LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID EFI_GUID(0xe03fc20a, 0x85dc, 0x406e, 0xb9, 0x0e, 0x4a, 0xb5, 0x02, 0x37, 0x1d, 0x95) +#define LINUX_EFI_SCREEN_INFO_TABLE_GUID EFI_GUID(0xe03fc20a, 0x85dc, 0x406e, 0xb9, 0x0e, 0x4a, 0xb5, 0x02, 0x37, 0x1d, 0x95) #define LINUX_EFI_ARM_CPU_STATE_TABLE_GUID EFI_GUID(0xef79e4aa, 0x3c3d, 0x4989, 0xb9, 0x02, 0x07, 0xa9, 0x43, 0xe5, 0x50, 0xd2) #define LINUX_EFI_LOADER_ENTRY_GUID EFI_GUID(0x4a67b082, 0x0a4c, 0x41cf, 0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f) #define LINUX_EFI_RANDOM_SEED_TABLE_GUID EFI_GUID(0x1ce1e5bc, 0x7ceb, 0x42f2, 0x81, 0xe5, 0x8a, 0xad, 0xf1, 0x80, 0xf5, 0x7b) -- cgit From c51e97e7f1295d3cf42682565506047f08dfc99b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 13 Oct 2022 12:42:07 +0200 Subject: efi: libstub: Merge zboot decompressor with the ordinary stub Even though our EFI zboot decompressor is pedantically spec compliant and idiomatic for EFI image loaders, calling LoadImage() and StartImage() for the nested image is a bit of a burden. Not only does it create workflow issues for the distros (as both the inner and outer PE/COFF images need to be signed for secure boot), it also copies the image around in memory numerous times: - first, the image is decompressed into a buffer; - the buffer is consumed by LoadImage(), which copies the sections into a newly allocated memory region to hold the executable image; - once the EFI stub is invoked by StartImage(), it will also move the image in memory in case of KASLR, mirrored memory or if the image must execute from a certain a priori defined address. There are only two EFI spec compliant ways to load code into memory and execute it: - use LoadImage() and StartImage(), - call ExitBootServices() and take ownership of the entire system, after which anything goes. Given that the EFI zboot decompressor always invokes the EFI stub, and given that both are built from the same set of objects, let's merge the two, so that we can avoid LoadImage()/StartImage but still load our image into memory without breaking the above rules. This also means we can decompress the image directly into its final location, which could be randomized or meet other platform specific constraints that LoadImage() does not know how to adhere to. It also means that, even if the encapsulated image still has the EFI stub incorporated as well, it does not need to be signed for secure boot when wrapping it in the EFI zboot decompressor. In the future, we might decide to retire the EFI stub attached to the decompressed image, but for the time being, they can happily coexist. Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index dfa6cc8bbef9..2ca082e3f136 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -411,7 +411,6 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_TPM_FINAL_LOG_GUID EFI_GUID(0x1e2ed096, 0x30e2, 0x4254, 0xbd, 0x89, 0x86, 0x3b, 0xbe, 0xf8, 0x23, 0x25) #define LINUX_EFI_MEMRESERVE_TABLE_GUID EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5, 0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2) #define LINUX_EFI_INITRD_MEDIA_GUID EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) -#define LINUX_EFI_ZBOOT_MEDIA_GUID EFI_GUID(0xe565a30d, 0x47da, 0x4dbd, 0xb3, 0x54, 0x9b, 0xb5, 0xc8, 0x4f, 0x8b, 0xe2) #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) #define LINUX_EFI_BOOT_MEMMAP_GUID EFI_GUID(0x800f683f, 0xd08b, 0x423a, 0xa2, 0x93, 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4) -- cgit From 2e2b4b896159f9d47d063ccf4ed0a7af9a40f1c5 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 19 Oct 2022 13:55:03 +0300 Subject: tty: Convert tty_buffer flags to bool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct tty_buffer has flags which is only used for storing TTYB_NORMAL. There is also a few quite confusing operations for checking the presense of TTYB_NORMAL. Simplify things by converting flags to bool. Despite the name remaining the same, the meaning of "flags" is altered slightly by this change. Previously it referred to flags of the buffer (only TTYB_NORMAL being used as a flag). After this change, flags tell whether the buffer contains/should be allocated with flags array along with character data array. It is much more suitable name that TTYB_NORMAL was for this purpose, thus the name remains. Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20221019105504.16800-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_buffer.h | 5 +---- include/linux/tty_flip.h | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_buffer.h b/include/linux/tty_buffer.h index 1796648c2907..6ceb2789e6c8 100644 --- a/include/linux/tty_buffer.h +++ b/include/linux/tty_buffer.h @@ -17,14 +17,11 @@ struct tty_buffer { int commit; int lookahead; /* Lazy update on recv, can become less than "read" */ int read; - int flags; + bool flags; /* Data points here */ unsigned long data[]; }; -/* Values for .flags field of tty_buffer */ -#define TTYB_NORMAL 1 /* buffer has no flags buffer */ - static inline unsigned char *char_buf_ptr(struct tty_buffer *b, int ofs) { return ((unsigned char *)b->data) + ofs; diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h index 483d41cbcbb7..bfaaeee61a05 100644 --- a/include/linux/tty_flip.h +++ b/include/linux/tty_flip.h @@ -25,9 +25,9 @@ static inline int tty_insert_flip_char(struct tty_port *port, struct tty_buffer *tb = port->buf.tail; int change; - change = (tb->flags & TTYB_NORMAL) && (flag != TTY_NORMAL); + change = !tb->flags && (flag != TTY_NORMAL); if (!change && tb->used < tb->size) { - if (~tb->flags & TTYB_NORMAL) + if (tb->flags) *flag_buf_ptr(tb, tb->used) = flag; *char_buf_ptr(tb, tb->used++) = ch; return 1; -- cgit From 2fe8e1dcf937272c5425e69947819894fcf077a6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 2 Sep 2022 17:55:27 -0700 Subject: gpiolib: remove devm_fwnode_get_[index_]gpiod_from_child() Now that there are no more users of these APIs in the kernel we can remove them. Signed-off-by: Dmitry Torokhov Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 36460ced060b..45da8f137fe5 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -581,27 +581,6 @@ struct gpio_desc *devm_fwnode_gpiod_get(struct device *dev, flags, label); } -static inline -struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev, - const char *con_id, int index, - struct fwnode_handle *child, - enum gpiod_flags flags, - const char *label) -{ - return devm_fwnode_gpiod_get_index(dev, child, con_id, index, - flags, label); -} - -static inline -struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev, - const char *con_id, - struct fwnode_handle *child, - enum gpiod_flags flags, - const char *label) -{ - return devm_fwnode_gpiod_get_index(dev, child, con_id, 0, flags, label); -} - #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_OF_GPIO) struct device_node; -- cgit From bd039b5ea2a91ea707ee8539df26456bd5be80af Mon Sep 17 00:00:00 2001 From: Andy Ren Date: Mon, 7 Nov 2022 09:42:42 -0800 Subject: net/core: Allow live renaming when an interface is up Allow a network interface to be renamed when the interface is up. As described in the netconsole documentation [1], when netconsole is used as a built-in, it will bring up the specified interface as soon as possible. As a result, user space will not be able to rename the interface since the kernel disallows renaming of interfaces that are administratively up unless the 'IFF_LIVE_RENAME_OK' private flag was set by the kernel. The original solution [2] to this problem was to add a new parameter to the netconsole configuration parameters that allows renaming of the interface used by netconsole while it is administratively up. However, during the discussion that followed, it became apparent that we have no reason to keep the current restriction and instead we should allow user space to rename interfaces regardless of their administrative state: 1. The restriction was put in place over 20 years ago when renaming was only possible via IOCTL and before rtnetlink started notifying user space about such changes like it does today. 2. The 'IFF_LIVE_RENAME_OK' flag was added over 3 years ago in version 5.2 and no regressions were reported. 3. In-kernel listeners to 'NETDEV_CHANGENAME' do not seem to care about the administrative state of interface. Therefore, allow user space to rename running interfaces by removing the restriction and the associated 'IFF_LIVE_RENAME_OK' flag. Help in possible triage by emitting a message to the kernel log that an interface was renamed while UP. [1] https://www.kernel.org/doc/Documentation/networking/netconsole.rst [2] https://lore.kernel.org/netdev/20221102002420.2613004-1-andy.ren@getcruise.com/ Signed-off-by: Andy Ren Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d45713a06568..4be87b89e481 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1650,7 +1650,6 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device - * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN @@ -1686,7 +1685,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, - IFF_LIVE_RENAME_OK = 1<<30, + /* was IFF_LIVE_RENAME_OK */ IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; @@ -1721,7 +1720,6 @@ enum netdev_priv_flags { #define IFF_FAILOVER IFF_FAILOVER #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER -#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK #define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR /* Specifies the type of the struct net_device::ml_priv pointer */ -- cgit From fa627348cfc7fb174468d88756b83c2d97890b07 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Oct 2022 18:54:26 +0200 Subject: driver core: class: make namespace and get_ownership take const * The callbacks in struct class namespace() and get_ownership() do not modify the struct device passed to them, so mark the pointer as constant and fix up all callbacks in the kernel to have the correct function signature. This helps make it more obvious what calls and callbacks do, and do not, modify structures passed to them. Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20221001165426.2690912-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index e61ec5502019..20103e0b03c3 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -68,9 +68,9 @@ struct class { int (*shutdown_pre)(struct device *dev); const struct kobj_ns_type_operations *ns_type; - const void *(*namespace)(struct device *dev); + const void *(*namespace)(const struct device *dev); - void (*get_ownership)(struct device *dev, kuid_t *uid, kgid_t *gid); + void (*get_ownership)(const struct device *dev, kuid_t *uid, kgid_t *gid); const struct dev_pm_ops *pm; -- cgit From f733615e39aa2d6ddeef33b7b2c9aa6a5a2c2785 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 7 Nov 2022 15:21:59 +0106 Subject: rcu: Implement lockdep_rcu_enabled for !CONFIG_DEBUG_LOCK_ALLOC Provide an implementation for debug_lockdep_rcu_enabled() when CONFIG_DEBUG_LOCK_ALLOC is not enabled. This allows code to check if rcu lockdep debugging is available without needing an extra check if CONFIG_DEBUG_LOCK_ALLOC is enabled. Signed-off-by: John Ogness Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 08605ce7379d..65178c40ab6f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -340,6 +340,11 @@ static inline int rcu_read_lock_any_held(void) return !preemptible(); } +static inline int debug_lockdep_rcu_enabled(void) +{ + return 0; +} + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU -- cgit From 07a368b3f55a79d33cad113d506b279e04fd2a00 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:27 +0300 Subject: bug: introduce ASSERT_STRUCT_OFFSET ASSERT_STRUCT_OFFSET allows to assert during the build of the kernel that a field in a struct have an expected offset. KVM used to have such macro, but there is almost nothing KVM specific in it so move it to build_bug.h, so that it can be used in other places in KVM. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-10-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/build_bug.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h index e3a0be2c90ad..3aa3640f8c18 100644 --- a/include/linux/build_bug.h +++ b/include/linux/build_bug.h @@ -77,4 +77,13 @@ #define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr) #define __static_assert(expr, msg, ...) _Static_assert(expr, msg) + +/* + * Compile time check that field has an expected offset + */ +#define ASSERT_STRUCT_OFFSET(type, field, expected_offset) \ + BUILD_BUG_ON_MSG(offsetof(type, field) != (expected_offset), \ + "Offset of " #field " in " #type " has changed.") + + #endif /* _LINUX_BUILD_BUG_H */ -- cgit From 93c5c61d9e58a9ea423439d358c198be5b674a58 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Oct 2022 15:58:06 -0400 Subject: mm/gup: Add FOLL_INTERRUPTIBLE We have had FAULT_FLAG_INTERRUPTIBLE but it was never applied to GUPs. One issue with it is that not all GUP paths are able to handle signal delivers besides SIGKILL. That's not ideal for the GUP users who are actually able to handle these cases, like KVM. KVM uses GUP extensively on faulting guest pages, during which we've got existing infrastructures to retry a page fault at a later time. Allowing the GUP to be interrupted by generic signals can make KVM related threads to be more responsive. For examples: (1) SIGUSR1: which QEMU/KVM uses to deliver an inter-process IPI, e.g. when the admin issues a vm_stop QMP command, SIGUSR1 can be generated to kick the vcpus out of kernel context immediately, (2) SIGINT: which can be used with interactive hypervisor users to stop a virtual machine with Ctrl-C without any delays/hangs, (3) SIGTRAP: which grants GDB capability even during page faults that are stuck for a long time. Normally hypervisor will be able to receive these signals properly, but not if we're stuck in a GUP for a long time for whatever reason. It happens easily with a stucked postcopy migration when e.g. a network temp failure happens, then some vcpu threads can hang death waiting for the pages. With the new FOLL_INTERRUPTIBLE, we can allow GUP users like KVM to selectively enable the ability to trap these signals. Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Signed-off-by: Peter Xu Message-Id: <20221011195809.557016-2-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bbcccbc5565..3c84f4e48cd7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2958,6 +2958,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_INTERRUPTIBLE 0x100000 /* allow interrupts from generic signals */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each -- cgit From fe5ed56c79733b7808f968567c581118ab79552e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Oct 2022 15:58:07 -0400 Subject: kvm: Add KVM_PFN_ERR_SIGPENDING Add a new pfn error to show that we've got a pending signal to handle during hva_to_pfn_slow() procedure (of -EINTR retval). Signed-off-by: Peter Xu Reviewed-by: Sean Christopherson Message-Id: <20221011195809.557016-3-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 18592bdf4c1b..911b064878df 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -96,6 +96,7 @@ #define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) +#define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3) /* * error pfns indicate that the gfn is in slot but faild to @@ -106,6 +107,15 @@ static inline bool is_error_pfn(kvm_pfn_t pfn) return !!(pfn & KVM_PFN_ERR_MASK); } +/* + * KVM_PFN_ERR_SIGPENDING indicates that fetching the PFN was interrupted + * by a pending signal. Note, the signal may or may not be fatal. + */ +static inline bool is_sigpending_pfn(kvm_pfn_t pfn) +{ + return pfn == KVM_PFN_ERR_SIGPENDING; +} + /* * error_noslot pfns indicate that the gfn can not be * translated to pfn - it is not in slot or failed to -- cgit From c8b88b332bedf47a9aa008dfb69998c90623375c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Oct 2022 15:58:08 -0400 Subject: kvm: Add interruptible flag to __gfn_to_pfn_memslot() Add a new "interruptible" flag showing that the caller is willing to be interrupted by signals during the __gfn_to_pfn_memslot() request. Wire it up with a FOLL_INTERRUPTIBLE flag that we've just introduced. This prepares KVM to be able to respond to SIGUSR1 (for QEMU that's the SIGIPI) even during e.g. handling an userfaultfd page fault. No functional change intended. Signed-off-by: Peter Xu Reviewed-by: Sean Christopherson Message-Id: <20221011195809.557016-4-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 911b064878df..8fe4665bd020 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1150,8 +1150,8 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool *async, bool write_fault, - bool *writable, hva_t *hva); + bool atomic, bool interruptible, bool *async, + bool write_fault, bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); -- cgit From d663b8a285986072428a6a145e5994bc275df994 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 3 Nov 2022 10:44:10 -0400 Subject: KVM: replace direct irq.h inclusion virt/kvm/irqchip.c is including "irq.h" from the arch-specific KVM source directory (i.e. not from arch/*/include) for the sole purpose of retrieving irqchip_in_kernel. Making the function inline in a header that is already included, such as asm/kvm_host.h, is not possible because it needs to look at struct kvm which is defined after asm/kvm_host.h is included. So add a kvm_arch_irqchip_in_kernel non-inline function; irqchip_in_kernel() is only performance critical on arm64 and x86, and the non-inline function is enough on all other architectures. irq.h can then be deleted from all architectures except x86. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8fe4665bd020..e6e66c5e56f2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -663,6 +663,8 @@ struct kvm_irq_routing_table { */ struct hlist_head map[]; }; + +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #endif #ifndef KVM_INTERNAL_MEM_SLOTS -- cgit From 0cda8c43aa2477b7a9f9bed0adff2f34d3afc143 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 9 Nov 2022 12:08:46 +0100 Subject: regulator: qcom_smd: Add PMR735a regulators PMR735a is already supported in the RPMH regulator driver, but there are cases where it's bundled with SMD RPM SoCs. Port it over to qcom_smd-regulator to enable usage in such cases. Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20221109110846.45789-2-konrad.dybcio@linaro.org Signed-off-by: Mark Brown --- include/linux/soc/qcom/smd-rpm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/smd-rpm.h b/include/linux/soc/qcom/smd-rpm.h index 3ab8c07f71c0..62de54992e49 100644 --- a/include/linux/soc/qcom/smd-rpm.h +++ b/include/linux/soc/qcom/smd-rpm.h @@ -19,6 +19,7 @@ struct qcom_smd_rpm; #define QCOM_SMD_RPM_CLK_BUF_A 0x616B6C63 #define QCOM_SMD_RPM_LDOA 0x616f646c #define QCOM_SMD_RPM_LDOB 0x626F646C +#define QCOM_SMD_RPM_LDOE 0x656f646c #define QCOM_SMD_RPM_RWCX 0x78637772 #define QCOM_SMD_RPM_RWMX 0x786d7772 #define QCOM_SMD_RPM_RWLC 0x636c7772 @@ -32,6 +33,7 @@ struct qcom_smd_rpm; #define QCOM_SMD_RPM_QUP_CLK 0x707571 #define QCOM_SMD_RPM_SMPA 0x61706d73 #define QCOM_SMD_RPM_SMPB 0x62706d73 +#define QCOM_SMD_RPM_SMPE 0x65706d73 #define QCOM_SMD_RPM_SPDM 0x63707362 #define QCOM_SMD_RPM_VSA 0x00617376 #define QCOM_SMD_RPM_MMAXI_CLK 0x69786d6d -- cgit From 9beccca0984022a844850e32f0d7dd80d4a225de Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 27 Oct 2022 17:59:07 +0200 Subject: scs: add support for dynamic shadow call stacks In order to allow arches to use code patching to conditionally emit the shadow stack pushes and pops, rather than always taking the performance hit even on CPUs that implement alternatives such as stack pointer authentication on arm64, add a Kconfig symbol that can be set by the arch to omit the SCS codegen itself, without otherwise affecting how support code for SCS and compiler options (for register reservation, for instance) are emitted. Also, add a static key and some plumbing to omit the allocation of shadow call stack for dynamic SCS configurations if SCS is disabled at runtime. Signed-off-by: Ard Biesheuvel Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20221027155908.1940624-3-ardb@kernel.org Signed-off-by: Will Deacon --- include/linux/scs.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scs.h b/include/linux/scs.h index 18122d9e17ff..4ab5bdc898cf 100644 --- a/include/linux/scs.h +++ b/include/linux/scs.h @@ -53,6 +53,22 @@ static inline bool task_scs_end_corrupted(struct task_struct *tsk) return sz >= SCS_SIZE - 1 || READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC; } +DECLARE_STATIC_KEY_FALSE(dynamic_scs_enabled); + +static inline bool scs_is_dynamic(void) +{ + if (!IS_ENABLED(CONFIG_DYNAMIC_SCS)) + return false; + return static_branch_likely(&dynamic_scs_enabled); +} + +static inline bool scs_is_enabled(void) +{ + if (!IS_ENABLED(CONFIG_DYNAMIC_SCS)) + return true; + return scs_is_dynamic(); +} + #else /* CONFIG_SHADOW_CALL_STACK */ static inline void *scs_alloc(int node) { return NULL; } @@ -62,6 +78,8 @@ static inline void scs_task_reset(struct task_struct *tsk) {} static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; } static inline void scs_release(struct task_struct *tsk) {} static inline bool task_scs_end_corrupted(struct task_struct *tsk) { return false; } +static inline bool scs_is_enabled(void) { return false; } +static inline bool scs_is_dynamic(void) { return false; } #endif /* CONFIG_SHADOW_CALL_STACK */ -- cgit From a5be5ce0e25439fae3cd42e3d775979547926812 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Thu, 3 Nov 2022 09:25:29 +0100 Subject: firmware/nvram: bcm47xx: support init from IO memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide NVMEM content to the NVRAM driver from a simple memory resource. This is necessary to use NVRAM in a memory- mapped flash device. Patch taken from OpenWrts development tree. This patch makes it possible to use memory-mapped NVRAM on the D-Link DWL-8610AP and the D-Link DIR-890L. Cc: Hauke Mehrtens Cc: linux-mips@vger.kernel.org Cc: Florian Fainelli Cc: bcm-kernel-feedback-list@broadcom.com Cc: Thomas Bogendoerfer Signed-off-by: Rafał Miłecki [Added an export for modules potentially using the init symbol] Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20221103082529.359084-1-linus.walleij@linaro.org Signed-off-by: Florian Fainelli --- include/linux/bcm47xx_nvram.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h index 53b31f69b74a..7615f8d7b1ed 100644 --- a/include/linux/bcm47xx_nvram.h +++ b/include/linux/bcm47xx_nvram.h @@ -11,6 +11,7 @@ #include #ifdef CONFIG_BCM47XX_NVRAM +int bcm47xx_nvram_init_from_iomem(void __iomem *nvram_start, size_t res_size); int bcm47xx_nvram_init_from_mem(u32 base, u32 lim); int bcm47xx_nvram_getenv(const char *name, char *val, size_t val_len); int bcm47xx_nvram_gpio_pin(const char *name); @@ -20,6 +21,11 @@ static inline void bcm47xx_nvram_release_contents(char *nvram) vfree(nvram); }; #else +static inline int bcm47xx_nvram_init_from_iomem(void __iomem *nvram_start, + size_t res_size) +{ + return -ENOTSUPP; +} static inline int bcm47xx_nvram_init_from_mem(u32 base, u32 lim) { return -ENOTSUPP; -- cgit From 0f0892356fa174bdd8bd655c820ee3658c4c9f01 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:08 -0600 Subject: mm: allow multiple error returns in try_grab_page() In order to add checks for P2PDMA memory into try_grab_page(), expand the error return from a bool to an int/error code. Update all the callsites handle change in usage. Also remove the WARN_ON_ONCE() call at the callsites seeing there already is a WARN_ON_ONCE() inside the function if it fails. Signed-off-by: Logan Gunthorpe Reviewed-by: Dan Williams Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221021174116.7200-2-logang@deltatee.com Signed-off-by: Jens Axboe --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bbcccbc5565..62a91dc1272b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1129,7 +1129,7 @@ static inline void get_page(struct page *page) folio_get(page_folio(page)); } -bool __must_check try_grab_page(struct page *page, unsigned int flags); +int __must_check try_grab_page(struct page *page, unsigned int flags); static inline __must_check bool try_get_page(struct page *page) { -- cgit From 4003f107fa2eabb0aab90e37a1ed7b74c6f0d132 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:09 -0600 Subject: mm: introduce FOLL_PCI_P2PDMA to gate getting PCI P2PDMA pages GUP Callers that expect PCI P2PDMA pages can now set FOLL_PCI_P2PDMA to allow obtaining P2PDMA pages. If GUP is called without the flag and a P2PDMA page is found, it will return an error in try_grab_page() or try_grab_folio(). The check is safe to do before taking the reference to the page in both cases seeing the page should be protected by either the appropriate ptl or mmap_lock; or the gup fast guarantees preventing TLB flushes. try_grab_folio() has one call site that WARNs on failure and cannot actually deal with the failure of this function (it seems it will get into an infinite loop). Expand the comment there to document a couple more conditions on why it will not fail. FOLL_PCI_P2PDMA cannot be set if FOLL_LONGTERM is set. This is to copy fsdax until pgmap refcounts are fixed (see the link below for more information). Link: https://lkml.kernel.org/r/Yy4Ot5MoOhsgYLTQ@ziepe.ca Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221021174116.7200-3-logang@deltatee.com Signed-off-by: Jens Axboe --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 62a91dc1272b..6b081a8dcf88 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2958,6 +2958,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each -- cgit From d82076403cef7fcd1e7617c9db48bf21ebdc1f9c Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:10 -0600 Subject: iov_iter: introduce iov_iter_get_pages_[alloc_]flags() Add iov_iter_get_pages_flags() and iov_iter_get_pages_alloc_flags() which take a flags argument that is passed to get_user_pages_fast(). This is so that FOLL_PCI_P2PDMA can be passed when appropriate. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221021174116.7200-4-logang@deltatee.com Signed-off-by: Jens Axboe --- include/linux/uio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 2e3134b14ffd..9ede533ce64c 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -247,8 +247,14 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); +ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, + size_t maxsize, unsigned maxpages, size_t *start, + unsigned gup_flags); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); +ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, size_t *start, + unsigned gup_flags); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); -- cgit From 49580e690755d0e51ed7aa2c33225dd884fa738a Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:11 -0600 Subject: block: add check when merging zone device pages Consecutive zone device pages should not be merged into the same sgl or bvec segment with other types of pages or if they belong to different pgmaps. Otherwise getting the pgmap of a given segment is not possible without scanning the entire segment. This helper returns true either if both pages are not zone device pages or both pages are zone device pages with the same pgmap. Add a helper to determine if zone device pages are mergeable and use this helper in page_is_mergeable(). Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221021174116.7200-5-logang@deltatee.com Signed-off-by: Jens Axboe --- include/linux/mmzone.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f74891556f3..9c49ec5d0e25 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -986,6 +986,25 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } + +/* + * Consecutive zone device pages should not be merged into the same sgl + * or bvec segment with other types of pages or if they belong to different + * pgmaps. Otherwise getting the pgmap of a given segment is not possible + * without scanning the entire segment. This helper returns true either if + * both pages are not zone device pages or both pages are zone device pages + * with the same pgmap. + */ +static inline bool zone_device_pages_have_same_pgmap(const struct page *a, + const struct page *b) +{ + if (is_zone_device_page(a) != is_zone_device_page(b)) + return false; + if (!is_zone_device_page(a)) + return true; + return a->pgmap == b->pgmap; +} + extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else @@ -993,6 +1012,11 @@ static inline bool is_zone_device_page(const struct page *page) { return false; } +static inline bool zone_device_pages_have_same_pgmap(const struct page *a, + const struct page *b) +{ + return true; +} #endif static inline bool folio_is_zone_device(const struct folio *folio) -- cgit From 3e52fba03a20234abc65a656cef063a1045d9723 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 8 Nov 2022 14:22:06 +0100 Subject: net: introduce a helper to move notifier block to different namespace Currently, net_dev() netdev notifier variant follows the netdev with per-net notifier from namespace to namespace. This is implemented by move_netdevice_notifiers_dev_net() helper. For devlink it is needed to re-register per-net notifier during devlink reload. Introduce a new helper called move_netdevice_notifier_net() and share the unregister/register code with existing move_netdevice_notifiers_dev_net() helper. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4be87b89e481..02a2318da7c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2826,6 +2826,8 @@ int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); +void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, + struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); -- cgit From 1498c503e19e587dbc26c8827633960a67594359 Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Tue, 18 Oct 2022 17:28:35 +0200 Subject: PM: domains: Store the next hrtimer wakeup in genpd The arch timer cannot wake up the Qualcomm Technologies, Inc. (QTI) SoCs from the deeper CPUidle states. To be able to wakeup from these deeper states, another always-on timer needs to be programmed through the so called CONTROL_TCS. As the RSC is part of CPU subsystem and the corresponding APSS RSC device is attached to the cluster PM domain (through genpd), it holds the responsibility to program the always-on timer, before entering any of these deeper CPUidle states. However, programming the timer requires information about the next hrtimer wakeup for the cluster PM domain, which is currently only known by genpd. Therefore, let's share this data through a new genpd helper function, dev_pm_genpd_get_next_hrtimer(). Signed-off-by: Maulik Shah Cc: "Rafael J. Wysocki" [Ulf: Reworked the code and updated the commit message] Signed-off-by: Ulf Hansson Tested-by: Dmitry Baryshkov # SM8450 Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20221018152837.619426-5-ulf.hansson@linaro.org --- include/linux/pm_domain.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index ebc351698090..1cd41bdf73cf 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * Flags to control the behaviour of a genpd. @@ -95,6 +96,7 @@ struct genpd_governor_data { s64 max_off_time_ns; bool max_off_time_changed; ktime_t next_wakeup; + ktime_t next_hrtimer; bool cached_power_down_ok; bool cached_power_down_state_idx; }; @@ -232,6 +234,7 @@ int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state); int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb); int dev_pm_genpd_remove_notifier(struct device *dev); void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next); +ktime_t dev_pm_genpd_get_next_hrtimer(struct device *dev); extern struct dev_power_governor simple_qos_governor; extern struct dev_power_governor pm_domain_always_on_gov; @@ -293,6 +296,10 @@ static inline int dev_pm_genpd_remove_notifier(struct device *dev) static inline void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next) { } +static inline ktime_t dev_pm_genpd_get_next_hrtimer(struct device *dev) +{ + return KTIME_MAX; +} #define simple_qos_governor (*(struct dev_power_governor *)(NULL)) #define pm_domain_always_on_gov (*(struct dev_power_governor *)(NULL)) #endif -- cgit From f6479ea4e5990e17b9690d66474198dcdba1b2c1 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 8 Nov 2022 14:25:56 +0000 Subject: net: mdio: add mdiodev_c45_(read|write) Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 00177567cfef..f7fbbf3069e7 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -488,6 +488,19 @@ static inline int mdiobus_c45_write(struct mii_bus *bus, int prtad, int devad, return mdiobus_write(bus, prtad, mdiobus_c45_addr(devad, regnum), val); } +static inline int mdiodev_c45_read(struct mdio_device *mdiodev, int devad, + u16 regnum) +{ + return mdiobus_c45_read(mdiodev->bus, mdiodev->addr, devad, regnum); +} + +static inline int mdiodev_c45_write(struct mdio_device *mdiodev, u32 devad, + u16 regnum, u16 val) +{ + return mdiobus_c45_write(mdiodev->bus, mdiodev->addr, devad, regnum, + val); +} + int mdiobus_register_device(struct mdio_device *mdiodev); int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); -- cgit From 802e19a066c40017321a0717817785d861aedc53 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 9 Nov 2022 17:23:56 +0200 Subject: pinctrl: Put space between type and data in compound literal It's slightly better to read when compound literal data and type are separated by a space. Signed-off-by: Andy Shevchenko Reviewed-by: Basavaraj Natikar Link: https://lore.kernel.org/r/20221109152356.39868-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinctrl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 31fe992412f0..a0d39b303431 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -40,7 +40,7 @@ struct pingroup { /* Convenience macro to define a single named or anonymous pingroup */ #define PINCTRL_PINGROUP(_name, _pins, _npins) \ -(struct pingroup){ \ +(struct pingroup) { \ .name = _name, \ .pins = _pins, \ .npins = _npins, \ -- cgit From 8dd7e4af585331dda004e92ed0739c3609e37177 Mon Sep 17 00:00:00 2001 From: Benedikt Niedermayr Date: Wed, 9 Nov 2022 11:24:54 +0100 Subject: memory: omap-gpmc: fix coverity issue "Control flow issues" Assign a big positive integer instead of an negative integer to an u32 variable. Also remove the check for ">= 0" which doesn't make sense for unsigned integers. Reported-by: coverity-bot Addresses-Coverity-ID: 1527139 ("Control flow issues") Fixes: 89aed3cd5cb9 ("memory: omap-gpmc: wait pin additions") Signed-off-by: Benedikt Niedermayr Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20221109102454.174320-1-benedikt.niedermayr@siemens.com Signed-off-by: Krzysztof Kozlowski --- include/linux/platform_data/gpmc-omap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h index 296b080c5c67..dcca6c5e23bb 100644 --- a/include/linux/platform_data/gpmc-omap.h +++ b/include/linux/platform_data/gpmc-omap.h @@ -137,11 +137,11 @@ struct gpmc_device_timings { #define GPMC_MUX_AD 2 /* Addr-Data multiplex */ /* Wait pin polarity values */ -#define GPMC_WAITPINPOLARITY_INVALID -1 +#define GPMC_WAITPINPOLARITY_INVALID UINT_MAX #define GPMC_WAITPINPOLARITY_ACTIVE_LOW 0 #define GPMC_WAITPINPOLARITY_ACTIVE_HIGH 1 -#define GPMC_WAITPIN_INVALID -1 +#define GPMC_WAITPIN_INVALID UINT_MAX struct gpmc_settings { bool burst_wrap; /* enables wrap bursting */ -- cgit From 30f89e524becdbaa483b34902b079c9d4dfaa4a3 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 2 Nov 2022 08:47:11 +0100 Subject: x86/cacheinfo: Switch cache_ap_init() to hotplug callback Instead of explicitly calling cache_ap_init() in identify_secondary_cpu() use a CPU hotplug callback instead. By registering the callback only after having started the non-boot CPUs and initializing cache_aps_delayed_init with "true", calling set_cache_aps_delayed_init() at boot time can be dropped. It should be noted that this change results in cache_ap_init() being called a little bit later when hotplugging CPUs. By using a new hotplug slot right at the start of the low level bringup this is not problematic, as no operations requiring a specific caching mode are performed that early in CPU initialization. Suggested-by: Borislav Petkov Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221102074713.21493-15-jgross@suse.com Signed-off-by: Borislav Petkov --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f61447913db9..0d277b4b025a 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -140,6 +140,7 @@ enum cpuhp_state { */ CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, + CPUHP_AP_CACHECTRL_STARTING, CPUHP_AP_SCHED_STARTING, CPUHP_AP_RCUTREE_DYING, CPUHP_AP_CPU_PM_STARTING, -- cgit From cf87ac739e488055a6046a410caa8f4da108948f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:08 +0800 Subject: KVM: x86: Introduce KVM_REQ_DIRTY_RING_SOFT_FULL The VCPU isn't expected to be runnable when the dirty ring becomes soft full, until the dirty pages are harvested and the dirty ring is reset from userspace. So there is a check in each guest's entrace to see if the dirty ring is soft full or not. The VCPU is stopped from running if its dirty ring has been soft full. The similar check will be needed when the feature is going to be supported on ARM64. As Marc Zyngier suggested, a new event will avoid pointless overhead to check the size of the dirty ring ('vcpu->kvm->dirty_ring_size') in each guest's entrance. Add KVM_REQ_DIRTY_RING_SOFT_FULL. The event is raised when the dirty ring becomes soft full in kvm_dirty_ring_push(). The event is only cleared in the check, done in the newly added helper kvm_dirty_ring_check_request(). Since the VCPU is not runnable when the dirty ring becomes soft full, the KVM_REQ_DIRTY_RING_SOFT_FULL event is always set to prevent the VCPU from running until the dirty pages are harvested and the dirty ring is reset by userspace. kvm_dirty_ring_soft_full() becomes a private function with the newly added helper kvm_dirty_ring_check_request(). The alignment for the various event definitions in kvm_host.h is changed to tab character by the way. In order to avoid using 'container_of()', the argument @ring is replaced by @vcpu in kvm_dirty_ring_push(). Link: https://lore.kernel.org/kvmarm/87lerkwtm5.wl-maz@kernel.org Suggested-by: Marc Zyngier Signed-off-by: Gavin Shan Reviewed-by: Peter Xu Reviewed-by: Sean Christopherson Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-2-gshan@redhat.com --- include/linux/kvm_dirty_ring.h | 12 ++++-------- include/linux/kvm_host.h | 9 +++++---- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 906f899813dc..9c13c4c3d30c 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -49,7 +49,7 @@ static inline int kvm_dirty_ring_reset(struct kvm *kvm, return 0; } -static inline void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, +static inline void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset) { } @@ -64,11 +64,6 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) { } -static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) -{ - return true; -} - #else /* CONFIG_HAVE_KVM_DIRTY_RING */ u32 kvm_dirty_ring_get_rsvd_entries(void); @@ -84,13 +79,14 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring); * returns =0: successfully pushed * <0: unable to push, need to wait */ -void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset); +void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset); + +bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu); /* for use in vm_operations_struct */ struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset); void kvm_dirty_ring_free(struct kvm_dirty_ring *ring); -bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring); #endif /* CONFIG_HAVE_KVM_DIRTY_RING */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 00c3448ba7f8..648d663f32c4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -153,10 +153,11 @@ static inline bool is_error_page(struct page *page) * Architecture-independent vcpu->requests bit members * Bits 3-7 are reserved for more arch-independent bits. */ -#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_UNBLOCK 2 -#define KVM_REQUEST_ARCH_BASE 8 +#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_UNBLOCK 2 +#define KVM_REQ_DIRTY_RING_SOFT_FULL 3 +#define KVM_REQUEST_ARCH_BASE 8 /* * KVM_REQ_OUTSIDE_GUEST_MODE exists is purely as way to force the vCPU to -- cgit From e8a18565e59303ac12c626a161d72bd890bd2062 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:09 +0800 Subject: KVM: Move declaration of kvm_cpu_dirty_log_size() to kvm_dirty_ring.h Not all architectures like ARM64 need to override the function. Move its declaration to kvm_dirty_ring.h to avoid the following compiling warning on ARM64 when the feature is enabled. arch/arm64/kvm/../../../virt/kvm/dirty_ring.c:14:12: \ warning: no previous prototype for 'kvm_cpu_dirty_log_size' \ [-Wmissing-prototypes] \ int __weak kvm_cpu_dirty_log_size(void) Reported-by: kernel test robot Signed-off-by: Gavin Shan Reviewed-by: Peter Xu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-3-gshan@redhat.com --- include/linux/kvm_dirty_ring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 9c13c4c3d30c..199ead37b104 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -66,6 +66,7 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) #else /* CONFIG_HAVE_KVM_DIRTY_RING */ +int kvm_cpu_dirty_log_size(void); u32 kvm_dirty_ring_get_rsvd_entries(void); int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); -- cgit From 86bdf3ebcfe1ded055282536fecce13001874740 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:10 +0800 Subject: KVM: Support dirty ring in conjunction with bitmap ARM64 needs to dirty memory outside of a VCPU context when VGIC/ITS is enabled. It's conflicting with that ring-based dirty page tracking always requires a running VCPU context. Introduce a new flavor of dirty ring that requires the use of both VCPU dirty rings and a dirty bitmap. The expectation is that for non-VCPU sources of dirty memory (such as the VGIC/ITS on arm64), KVM writes to the dirty bitmap. Userspace should scan the dirty bitmap before migrating the VM to the target. Use an additional capability to advertise this behavior. The newly added capability (KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP) can't be enabled before KVM_CAP_DIRTY_LOG_RING_ACQ_REL on ARM64. In this way, the newly added capability is treated as an extension of KVM_CAP_DIRTY_LOG_RING_ACQ_REL. Suggested-by: Marc Zyngier Suggested-by: Peter Xu Co-developed-by: Oliver Upton Signed-off-by: Oliver Upton Signed-off-by: Gavin Shan Acked-by: Peter Xu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-4-gshan@redhat.com --- include/linux/kvm_dirty_ring.h | 7 +++++++ include/linux/kvm_host.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 199ead37b104..4862c98d80d3 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -37,6 +37,11 @@ static inline u32 kvm_dirty_ring_get_rsvd_entries(void) return 0; } +static inline bool kvm_use_dirty_bitmap(struct kvm *kvm) +{ + return true; +} + static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) { @@ -67,6 +72,8 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) #else /* CONFIG_HAVE_KVM_DIRTY_RING */ int kvm_cpu_dirty_log_size(void); +bool kvm_use_dirty_bitmap(struct kvm *kvm); +bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm); u32 kvm_dirty_ring_get_rsvd_entries(void); int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 648d663f32c4..db83f63f4e61 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -779,6 +779,7 @@ struct kvm { pid_t userspace_pid; unsigned int max_halt_poll_ns; u32 dirty_ring_size; + bool dirty_ring_with_bitmap; bool vm_bugged; bool vm_dead; -- cgit From 5d1ba31087627423dfb2bd87badd62361701997b Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 21 Oct 2022 11:24:04 +0800 Subject: mm: kasan: Extend kasan_metadata_size() to also cover in-object size When kasan is enabled for slab/slub, it may save kasan' free_meta data in the former part of slab object data area in slab object's free path, which works fine. There is ongoing effort to extend slub's debug function which will redzone the latter part of kmalloc object area, and when both of the debug are enabled, there is possible conflict, especially when the kmalloc object has small size, as caught by 0Day bot [1]. To solve it, slub code needs to know the in-object kasan's meta data size. Currently, there is existing kasan_metadata_size() which returns the kasan's metadata size inside slub's metadata area, so extend it to also cover the in-object meta size by adding a boolean flag 'in_object'. There is no functional change to existing code logic. [1]. https://lore.kernel.org/lkml/YuYm3dWwpZwH58Hu@xsang-OptiPlex-9020/ Reported-by: kernel test robot Suggested-by: Andrey Konovalov Signed-off-by: Feng Tang Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Vlastimil Babka --- include/linux/kasan.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d811b3d7d2a1..96c9d56e5510 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -302,7 +302,7 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC -size_t kasan_metadata_size(struct kmem_cache *cache); +size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object); slab_flags_t kasan_never_merge(void); void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags); @@ -315,7 +315,8 @@ void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ /* Tag-based KASAN modes do not use per-object metadata. */ -static inline size_t kasan_metadata_size(struct kmem_cache *cache) +static inline size_t kasan_metadata_size(struct kmem_cache *cache, + bool in_object) { return 0; } -- cgit From be7e8b917ead54754cc14b6c03769c8738a3f3f3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 9 Nov 2022 15:48:43 +0100 Subject: blkdev: make struct block_device_operations.devnode() take a const * The devnode() callback in struct block_device_operations should not be modifying the device that is passed into it, so mark it as a const * and propagate the function signature changes out into the one subsystem that actually uses this callback. Acked-by: Jens Axboe Link: https://lore.kernel.org/r/20221109144843.679668-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 50e358a19d98..2a455793462b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1413,7 +1413,7 @@ struct block_device_operations { void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); - char *(*devnode)(struct gendisk *disk, umode_t *mode); + char *(*devnode)(const struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], enum blk_unique_id id_type); -- cgit From 927bdd1e65bd14ae035d9c625df2f4ccd51e8a83 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 9 Nov 2022 15:07:10 +0100 Subject: driver core: remove devm_device_remove_groups() There is no in-kernel user of this function, so it is not needed anymore and can be removed. Cc: Dmitry Torokhov Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20221109140711.105222-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 023ea50b1916..4efc607c008c 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1062,8 +1062,6 @@ static inline void device_remove_group(struct device *dev, int __must_check devm_device_add_groups(struct device *dev, const struct attribute_group **groups); -void devm_device_remove_groups(struct device *dev, - const struct attribute_group **groups); int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); void devm_device_remove_group(struct device *dev, -- cgit From 0f0605d550ed986279030d452c7ed10df34da449 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 9 Nov 2022 15:07:11 +0100 Subject: driver core: remove devm_device_remove_group() There is no in-kernel user of this function, so it is not needed anymore and can be removed. Cc: Dmitry Torokhov Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20221109140711.105222-2-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 4efc607c008c..84ae52de6746 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1064,8 +1064,6 @@ int __must_check devm_device_add_groups(struct device *dev, const struct attribute_group **groups); int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); -void devm_device_remove_group(struct device *dev, - const struct attribute_group *grp); /* * Platform "fixup" functions - allow the platform to have their say -- cgit From 52c4d11f1dce60453ab2a75fd7103118cedb2b58 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 9 Nov 2022 17:56:18 +0200 Subject: resource: Convert DEFINE_RES_NAMED() to be compound literal Currently DEFINE_RES_NAMED() can only be used to fill the static data. In some cases it would be convenient to use it as right value in the assignment operation. But it can't be done as is, because compiler has no clue about the data layout. Converting it to be a compound literal allows the above mentioned usage. Signed-off-by: Andy Shevchenko Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20221109155618.42276-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/ioport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 27642ca15d93..67d3fb2133b6 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -155,7 +155,7 @@ enum { /* helpers to define resources */ #define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ - { \ +(struct resource) { \ .start = (_start), \ .end = (_start) + (_size) - 1, \ .name = (_name), \ -- cgit From d1104f9327df9b26901b97cd026949f80ccab0d3 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:06 +0100 Subject: vfio/ccw: replace vfio_init_device with _alloc_ Now that we have a reasonable separation of structs that follow the subchannel and mdev lifecycles, there's no reason we can't call the official vfio_alloc_device routine for our private data, and behave like everyone else. Signed-off-by: Eric Farman Reviewed-by: Kevin Tian Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-7-farman@linux.ibm.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7cebeb875dd..ba809268a48e 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -176,8 +176,6 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, dev, ops), \ struct dev_struct, member) -int vfio_init_device(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops); void vfio_free_device(struct vfio_device *device); static inline void vfio_put_device(struct vfio_device *device) { -- cgit From 913447d06f032a9e9c84870bec0b1adb8c588f29 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:07 +0100 Subject: vfio: Remove vfio_free_device With the "mess" sorted out, we should be able to inline the vfio_free_device call introduced by commit cb9ff3f3b84c ("vfio: Add helpers for unifying vfio_device life cycle") and remove them from driver release callbacks. Signed-off-by: Eric Farman Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Tony Krowiak # vfio-ap part Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-8-farman@linux.ibm.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ba809268a48e..e7480154825e 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -176,7 +176,6 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, dev, ops), \ struct dev_struct, member) -void vfio_free_device(struct vfio_device *device); static inline void vfio_put_device(struct vfio_device *device) { put_device(&device->device); -- cgit From cc514101a97e3fb48f8617c8f291db798d10d831 Mon Sep 17 00:00:00 2001 From: Sujuan Chen Date: Sat, 5 Nov 2022 23:36:18 +0100 Subject: net: ethernet: mtk_wed: introduce wed mcu support Introduce WED mcu support used to configure WED WO chip. This is a preliminary patch in order to add RX Wireless Ethernet Dispatch available on MT7986 SoC. Tested-by: Daniel Golle Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Sujuan Chen Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 4450c8b7a1cb..2cc2f1e43ba9 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -11,6 +11,35 @@ struct mtk_wed_hw; struct mtk_wdma_desc; +enum mtk_wed_wo_cmd { + MTK_WED_WO_CMD_WED_CFG, + MTK_WED_WO_CMD_WED_RX_STAT, + MTK_WED_WO_CMD_RRO_SER, + MTK_WED_WO_CMD_DBG_INFO, + MTK_WED_WO_CMD_DEV_INFO, + MTK_WED_WO_CMD_BSS_INFO, + MTK_WED_WO_CMD_STA_REC, + MTK_WED_WO_CMD_DEV_INFO_DUMP, + MTK_WED_WO_CMD_BSS_INFO_DUMP, + MTK_WED_WO_CMD_STA_REC_DUMP, + MTK_WED_WO_CMD_BA_INFO_DUMP, + MTK_WED_WO_CMD_FBCMD_Q_DUMP, + MTK_WED_WO_CMD_FW_LOG_CTRL, + MTK_WED_WO_CMD_LOG_FLUSH, + MTK_WED_WO_CMD_CHANGE_STATE, + MTK_WED_WO_CMD_CPU_STATS_ENABLE, + MTK_WED_WO_CMD_CPU_STATS_DUMP, + MTK_WED_WO_CMD_EXCEPTION_INIT, + MTK_WED_WO_CMD_PROF_CTRL, + MTK_WED_WO_CMD_STA_BA_DUMP, + MTK_WED_WO_CMD_BA_CTRL_DUMP, + MTK_WED_WO_CMD_RXCNT_CTRL, + MTK_WED_WO_CMD_RXCNT_INFO, + MTK_WED_WO_CMD_SET_CAP, + MTK_WED_WO_CMD_CCIF_RING_DUMP, + MTK_WED_WO_CMD_WED_END +}; + enum mtk_wed_bus_tye { MTK_WED_BUS_PCIE, MTK_WED_BUS_AXI, -- cgit From 084d60ce0c6cef96024d53a58b92d7ff2d8b9318 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Nov 2022 23:36:20 +0100 Subject: net: ethernet: mtk_wed: rename tx_wdma array in rx_wdma Rename tx_wdma queue array in rx_wdma since this is rx side of wdma soc. Moreover rename mtk_wed_wdma_ring_setup routine in mtk_wed_wdma_rx_ring_setup() Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 2cc2f1e43ba9..956978320f8b 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -7,6 +7,7 @@ #include #define MTK_WED_TX_QUEUES 2 +#define MTK_WED_RX_QUEUES 2 struct mtk_wed_hw; struct mtk_wdma_desc; @@ -66,7 +67,7 @@ struct mtk_wed_device { struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; struct mtk_wed_ring txfree_ring; - struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES]; + struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES]; struct { int size; -- cgit From 4c5de09eb0d05fc9e73ff8f0e052622f1f3df6d8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Nov 2022 23:36:21 +0100 Subject: net: ethernet: mtk_wed: add configure wed wo support Enable RX Wireless Ethernet Dispatch available on MT7986 Soc. Tested-by: Daniel Golle Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 76 +++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 956978320f8b..8294978f4bca 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -5,10 +5,13 @@ #include #include #include +#include #define MTK_WED_TX_QUEUES 2 #define MTK_WED_RX_QUEUES 2 +#define WED_WO_STA_REC 0x6 + struct mtk_wed_hw; struct mtk_wdma_desc; @@ -41,21 +44,37 @@ enum mtk_wed_wo_cmd { MTK_WED_WO_CMD_WED_END }; +struct mtk_rxbm_desc { + __le32 buf0; + __le32 token; +} __packed __aligned(4); + enum mtk_wed_bus_tye { MTK_WED_BUS_PCIE, MTK_WED_BUS_AXI, }; +#define MTK_WED_RING_CONFIGURED BIT(0) struct mtk_wed_ring { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; u32 desc_size; int size; + u32 flags; u32 reg_base; void __iomem *wpdma; }; +struct mtk_wed_wo_rx_stats { + __le16 wlan_idx; + __le16 tid; + __le32 rx_pkt_cnt; + __le32 rx_byte_cnt; + __le32 rx_err_cnt; + __le32 rx_drop_cnt; +}; + struct mtk_wed_device { #ifdef CONFIG_NET_MEDIATEK_SOC_WED const struct mtk_wed_ops *ops; @@ -64,9 +83,12 @@ struct mtk_wed_device { bool init_done, running; int wdma_idx; int irq; + u8 version; struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; + struct mtk_wed_ring rx_ring[MTK_WED_RX_QUEUES]; struct mtk_wed_ring txfree_ring; + struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES]; struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES]; struct { @@ -74,7 +96,20 @@ struct mtk_wed_device { void **pages; struct mtk_wdma_desc *desc; dma_addr_t desc_phys; - } buf_ring; + } tx_buf_ring; + + struct { + int size; + struct page_frag_cache rx_page; + struct mtk_rxbm_desc *desc; + dma_addr_t desc_phys; + } rx_buf_ring; + + struct { + struct mtk_wed_ring ring; + dma_addr_t miod_phys; + dma_addr_t fdbk_phys; + } rro; /* filled by driver: */ struct { @@ -83,22 +118,36 @@ struct mtk_wed_device { struct pci_dev *pci_dev; }; enum mtk_wed_bus_tye bus_type; + void __iomem *base; + u32 phy_base; u32 wpdma_phys; u32 wpdma_int; u32 wpdma_mask; u32 wpdma_tx; u32 wpdma_txfree; + u32 wpdma_rx_glo; + u32 wpdma_rx; + + bool wcid_512; u16 token_start; unsigned int nbuf; + unsigned int rx_nbuf; + unsigned int rx_npkt; + unsigned int rx_size; u8 tx_tbit[MTK_WED_TX_QUEUES]; + u8 rx_tbit[MTK_WED_RX_QUEUES]; u8 txfree_tbit; u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id); int (*offload_enable)(struct mtk_wed_device *wed); void (*offload_disable)(struct mtk_wed_device *wed); + u32 (*init_rx_buf)(struct mtk_wed_device *wed, int size); + void (*release_rx_buf)(struct mtk_wed_device *wed); + void (*update_wo_rx_stats)(struct mtk_wed_device *wed, + struct mtk_wed_wo_rx_stats *stats); } wlan; #endif }; @@ -107,9 +156,15 @@ struct mtk_wed_ops { int (*attach)(struct mtk_wed_device *dev); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs); + int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, + void __iomem *regs); int (*txfree_ring_setup)(struct mtk_wed_device *dev, void __iomem *regs); + int (*msg_update)(struct mtk_wed_device *dev, int cmd_id, + void *data, int len); void (*detach)(struct mtk_wed_device *dev); + void (*ppe_check)(struct mtk_wed_device *dev, struct sk_buff *skb, + u32 reason, u32 hash); void (*stop)(struct mtk_wed_device *dev); void (*start)(struct mtk_wed_device *dev, u32 irq_mask); @@ -144,6 +199,16 @@ mtk_wed_device_attach(struct mtk_wed_device *dev) return ret; } +static inline bool +mtk_wed_get_rx_capa(struct mtk_wed_device *dev) +{ +#ifdef CONFIG_NET_MEDIATEK_SOC_WED + return dev->version != 1; +#else + return false; +#endif +} + #ifdef CONFIG_NET_MEDIATEK_SOC_WED #define mtk_wed_device_active(_dev) !!(_dev)->ops #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev) @@ -160,6 +225,12 @@ mtk_wed_device_attach(struct mtk_wed_device *dev) (_dev)->ops->irq_get(_dev, _mask) #define mtk_wed_device_irq_set_mask(_dev, _mask) \ (_dev)->ops->irq_set_mask(_dev, _mask) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) \ + (_dev)->ops->rx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) \ + (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) +#define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ + (_dev)->ops->msg_update(_dev, _id, _msg, _len) #else static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) { @@ -173,6 +244,9 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) #define mtk_wed_device_irq_get(_dev, _mask) 0 #define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) +#define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV #endif #endif -- cgit From d5b560c014eddce747b2fd528d3071ded215b219 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Thu, 10 Nov 2022 00:47:42 +0300 Subject: ata: libata-sff: kill unused ata_sff_busy_sleep() Nobody seems to call ata_sff_busy_sleep(), so we can get rid of it... Signed-off-by: Sergey Shtylyov Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index af4953b95f76..c9149ebe7423 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1918,8 +1918,6 @@ extern void ata_sff_dev_select(struct ata_port *ap, unsigned int device); extern u8 ata_sff_check_status(struct ata_port *ap); extern void ata_sff_pause(struct ata_port *ap); extern void ata_sff_dma_pause(struct ata_port *ap); -extern int ata_sff_busy_sleep(struct ata_port *ap, - unsigned long timeout_pat, unsigned long timeout); extern int ata_sff_wait_ready(struct ata_link *link, unsigned long deadline); extern void ata_sff_tf_load(struct ata_port *ap, const struct ata_taskfile *tf); extern void ata_sff_tf_read(struct ata_port *ap, struct ata_taskfile *tf); -- cgit From 75ab70ec5cefade915a999886e4863fe6a38f8b5 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 9 Nov 2022 15:09:45 -0800 Subject: ptp: remove the .adjfreq interface function Now that all drivers have been converted to .adjfine, we can remove the .adjfreq from the interface structure. Signed-off-by: Jacob Keller Cc: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index f4781c5766d6..fdffa6a98d79 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -77,12 +77,6 @@ struct ptp_system_timestamp { * nominal frequency in parts per million, but with a * 16 bit binary fractional field. * - * @adjfreq: Adjusts the frequency of the hardware clock. - * This method is deprecated. New drivers should implement - * the @adjfine method instead. - * parameter delta: Desired frequency offset from nominal frequency - * in parts per billion - * * @adjphase: Adjusts the phase offset of the hardware clock. * parameter delta: Desired change in nanoseconds. * @@ -174,7 +168,6 @@ struct ptp_clock_info { int pps; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); - int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta); int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); -- cgit From 4f8126bb2308066b877859e4b5923ffb54143630 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Sat, 5 Nov 2022 19:10:55 -0400 Subject: sbitmap: Use single per-bitmap counting to wake up queued tags sbitmap suffers from code complexity, as demonstrated by recent fixes, and eventual lost wake ups on nested I/O completion. The later happens, from what I understand, due to the non-atomic nature of the updates to wait_cnt, which needs to be subtracted and eventually reset when equal to zero. This two step process can eventually miss an update when a nested completion happens to interrupt the CPU in between the wait_cnt updates. This is very hard to fix, as shown by the recent changes to this code. The code complexity arises mostly from the corner cases to avoid missed wakes in this scenario. In addition, the handling of wake_batch recalculation plus the synchronization with sbq_queue_wake_up is non-trivial. This patchset implements the idea originally proposed by Jan [1], which removes the need for the two-step updates of wait_cnt. This is done by tracking the number of completions and wakeups in always increasing, per-bitmap counters. Instead of having to reset the wait_cnt when it reaches zero, we simply keep counting, and attempt to wake up N threads in a single wait queue whenever there is enough space for a batch. Waking up less than batch_wake shouldn't be a problem, because we haven't changed the conditions for wake up, and the existing batch calculation guarantees at least enough remaining completions to wake up a batch for each queue at any time. Performance-wise, one should expect very similar performance to the original algorithm for the case where there is no queueing. In both the old algorithm and this implementation, the first thing is to check ws_active, which bails out if there is no queueing to be managed. In the new code, we took care to avoid accounting completions and wakeups when there is no queueing, to not pay the cost of atomic operations unnecessarily, since it doesn't skew the numbers. For more interesting cases, where there is queueing, we need to take into account the cross-communication of the atomic operations. I've been benchmarking by running parallel fio jobs against a single hctx nullb in different hardware queue depth scenarios, and verifying both IOPS and queueing. Each experiment was repeated 5 times on a 20-CPU box, with 20 parallel jobs. fio was issuing fixed-size randwrites with qd=64 against nullb, varying only the hardware queue length per test. queue size 2 4 8 16 32 64 6.1-rc2 1681.1K (1.6K) 2633.0K (12.7K) 6940.8K (16.3K) 8172.3K (617.5K) 8391.7K (367.1K) 8606.1K (351.2K) patched 1721.8K (15.1K) 3016.7K (3.8K) 7543.0K (89.4K) 8132.5K (303.4K) 8324.2K (230.6K) 8401.8K (284.7K) The following is a similar experiment, ran against a nullb with a single bitmap shared by 20 hctx spread across 2 NUMA nodes. This has 40 parallel fio jobs operating on the same device queue size 2 4 8 16 32 64 6.1-rc2 1081.0K (2.3K) 957.2K (1.5K) 1699.1K (5.7K) 6178.2K (124.6K) 12227.9K (37.7K) 13286.6K (92.9K) patched 1081.8K (2.8K) 1316.5K (5.4K) 2364.4K (1.8K) 6151.4K (20.0K) 11893.6K (17.5K) 12385.6K (18.4K) It has also survived blktests and a 12h-stress run against nullb. I also ran the code against nvme and a scsi SSD, and I didn't observe performance regression in those. If there are other tests you think I should run, please let me know and I will follow up with results. [1] https://lore.kernel.org/all/aef9de29-e9f5-259a-f8be-12d1b734e72@google.com/ Cc: Hugh Dickins Cc: Keith Busch Cc: Liu Song Suggested-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20221105231055.25953-1-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 4d2d5205ab58..d662cf136021 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -86,11 +86,6 @@ struct sbitmap { * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. */ struct sbq_wait_state { - /** - * @wait_cnt: Number of frees remaining before we wake up. - */ - atomic_t wait_cnt; - /** * @wait: Wait queue. */ @@ -138,6 +133,17 @@ struct sbitmap_queue { * sbitmap_queue_get_shallow() */ unsigned int min_shallow_depth; + + /** + * @completion_cnt: Number of bits cleared passed to the + * wakeup function. + */ + atomic_t completion_cnt; + + /** + * @wakeup_cnt: Number of thread wake ups issued. + */ + atomic_t wakeup_cnt; }; /** -- cgit From 3cd60866d46050d14734cbbac41b00c8d3e51d61 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 29 Sep 2022 14:10:39 +0200 Subject: module: remove redundant module_sysfs_initialized variable The variable module_sysfs_initialized is used for checking whether module_kset has been initialized. Checking module_kset itself works just fine for that. This is a leftover from commit 7405c1e15edf ("kset: convert /sys/module to use kset_create"). Signed-off-by: Rasmus Villemoes Reviewed-by: Miroslav Benes [mcgrof: adjusted commit log as suggested by Christophe Leroy] Signed-off-by: Luis Chamberlain --- include/linux/module.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index ec61fb53979a..676614d56c25 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -827,7 +827,6 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr) #ifdef CONFIG_SYSFS extern struct kset *module_kset; extern struct kobj_type module_ktype; -extern int module_sysfs_initialized; #endif /* CONFIG_SYSFS */ #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) -- cgit From 354259fa73e2aac92ae5e19522adb69a92c15b49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Nov 2022 09:57:58 +0000 Subject: net: remove skb->vlan_present skb->vlan_present seems redundant. We can instead derive it from this boolean expression: vlan_present = skb->vlan_proto != 0 || skb->vlan_tci != 0 Add a new union, to access both fields in a single load/store when possible. union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; This allows following patch to remove a conditional test in GRO stack. Note: We move remcsum_offload to keep TC_AT_INGRESS_MASK and SKB_MONO_DELIVERY_TIME_MASK unchanged. Signed-off-by: Eric Dumazet Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 9 +++------ include/linux/skbuff.h | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index e00c4ee81ff7..6864b89ef868 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -76,7 +76,7 @@ static inline bool is_vlan_dev(const struct net_device *dev) return dev->priv_flags & IFF_802_1Q_VLAN; } -#define skb_vlan_tag_present(__skb) ((__skb)->vlan_present) +#define skb_vlan_tag_present(__skb) (!!(__skb)->vlan_all) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK)) @@ -471,7 +471,7 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { - skb->vlan_present = 0; + skb->vlan_all = 0; } /** @@ -483,9 +483,7 @@ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { - dst->vlan_present = src->vlan_present; - dst->vlan_proto = src->vlan_proto; - dst->vlan_tci = src->vlan_tci; + dst->vlan_all = src->vlan_all; } /* @@ -519,7 +517,6 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, { skb->vlan_proto = vlan_proto; skb->vlan_tci = vlan_tci; - skb->vlan_present = 1; } /** diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 59c9fd55699d..4e464a27adaf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -818,7 +818,7 @@ typedef unsigned char *sk_buff_data_t; * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff - * @vlan_present: VLAN tag is present + * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) @@ -951,7 +951,7 @@ struct sk_buff { /* private: */ __u8 __pkt_vlan_present_offset[0]; /* public: */ - __u8 vlan_present:1; /* See PKT_VLAN_PRESENT_BIT */ + __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 dst_pending_confirm:1; @@ -966,7 +966,6 @@ struct sk_buff { __u8 ipvs_property:1; __u8 inner_protocol_type:1; - __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; @@ -999,8 +998,13 @@ struct sk_buff { __u32 priority; int skb_iif; __u32 hash; - __be16 vlan_proto; - __u16 vlan_tci; + union { + u32 vlan_all; + struct { + __be16 vlan_proto; + __u16 vlan_tci; + }; + }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; @@ -1059,15 +1063,13 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) -/* if you move pkt_vlan_present, tc_at_ingress, or mono_delivery_time +/* if you move tc_at_ingress or mono_delivery_time * around, you also must adapt these constants. */ #ifdef __BIG_ENDIAN_BITFIELD -#define PKT_VLAN_PRESENT_BIT 7 #define TC_AT_INGRESS_MASK (1 << 0) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 2) #else -#define PKT_VLAN_PRESENT_BIT 0 #define TC_AT_INGRESS_MASK (1 << 7) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) #endif -- cgit From 3b4c7bc01727e3a465759236eeac03d0dd686da3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 4 Nov 2022 13:52:42 +0100 Subject: xattr: use rbtree for simple_xattrs A while ago Vasily reported that it is possible to set a large number of xattrs on inodes of filesystems that make use of the simple xattr infrastructure. This includes all kernfs-based filesystems that support xattrs (e.g., cgroupfs and tmpfs). Both cgroupfs and tmpfs can be mounted by unprivileged users in unprivileged containers and root in an unprivileged container can set an unrestricted number of security.* xattrs and privileged users can also set unlimited trusted.* xattrs. As there are apparently users that have a fairly large number of xattrs we should scale a bit better. Other xattrs such as user.* are restricted for kernfs-based instances to a fairly limited number. Using a simple linked list protected by a spinlock used for set, get, and list operations doesn't scale well if users use a lot of xattrs even if it's not a crazy number. There's no need to bring in the big guns like rhashtables or rw semaphores for this. An rbtree with a rwlock, or limited rcu semanics and seqlock is enough. It scales within the constraints we are working in. By far the most common operation is getting an xattr. Setting xattrs should be a moderately rare operation. And listxattr() often only happens when copying xattrs between files or together with the contents to a new file. Holding a lock across listxattr() is unproblematic because it doesn't list the values of xattrs. It can only be used to list the names of all xattrs set on a file. And the number of xattr names that can be listed with listxattr() is limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names are found it will return -E2BIG. In short, the maximum amount of memory that can be retrieved via listxattr() is limited. Of course, the API is broken as documented on xattr(7) already. In the future we might want to address this but for now this is the world we live in and have lived for a long time. But it does indeed mean that once an application goes over XATTR_LIST_MAX limit of xattrs set on an inode it isn't possible to copy the file and include its xattrs in the copy unless the caller knows all xattrs or limits the copy of the xattrs to important ones it knows by name (At least for tmpfs, and kernfs-based filesystems. Other filesystems might provide ways of achieving this.). Bonus of this port to rbtree+rwlock is that we shrink the memory consumption for users of the simple xattr infrastructure. Also add proper kernel documentation to all the functions. A big thanks to Paul for his comments. Cc: Vasily Averin Cc: "Paul E. McKenney" Acked-by: Roman Gushchin Acked-by: Paul E. McKenney Signed-off-by: Christian Brauner (Microsoft) --- include/linux/xattr.h | 38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 4c379d23ec6e..b559c6bbcad0 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -80,48 +80,28 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) } struct simple_xattrs { - struct list_head head; - spinlock_t lock; + struct rb_root rb_root; + rwlock_t lock; }; struct simple_xattr { - struct list_head list; + struct rb_node rb_node; char *name; size_t size; char value[]; }; -/* - * initialize the simple_xattrs structure - */ -static inline void simple_xattrs_init(struct simple_xattrs *xattrs) -{ - INIT_LIST_HEAD(&xattrs->head); - spin_lock_init(&xattrs->lock); -} - -/* - * free all the xattrs - */ -static inline void simple_xattrs_free(struct simple_xattrs *xattrs) -{ - struct simple_xattr *xattr, *node; - - list_for_each_entry_safe(xattr, node, &xattrs->head, list) { - kfree(xattr->name); - kvfree(xattr); - } -} - +void simple_xattrs_init(struct simple_xattrs *xattrs); +void simple_xattrs_free(struct simple_xattrs *xattrs); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags, ssize_t *removed_size); -ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, - size_t size); -void simple_xattr_list_add(struct simple_xattrs *xattrs, - struct simple_xattr *new_xattr); +ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, + char *buffer, size_t size); +void simple_xattr_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr); #endif /* _LINUX_XATTR_H */ -- cgit From 2c925db0a7d69b404d6bfe4c037935c2d367913d Mon Sep 17 00:00:00 2001 From: Ofer Levi Date: Tue, 9 Feb 2021 17:48:11 +0200 Subject: net/mlx5e: Support enhanced CQE compression CQE compression feature improves performance by reducing PCI bandwidth bottleneck on CQEs write. Enhanced CQE compression introduced in ConnectX-6 and it aims to reduce CPU utilization of SW side packets decompression by eliminating the need to rewrite ownership bit, which is likely to cost a cache-miss, is replaced by validity byte handled solely by HW. Another advantage of the enhanced feature is that session packets are available to SW as soon as a single CQE slot is filled, instead of waiting for session to close, this improves packet latency from NIC to host. Performance: Following are tested scenarios and reults comparing basic and enahnced CQE compression. setup: IXIA 100GbE connected directly to port 0 and port 1 of ConnectX-6 Dx 100GbE dual port. Case #1 RX only, single flow goes to single queue: IRQ rate reduced by ~ 30%, CPU utilization improved by 2%. Case #2 IP forwarding from port 1 to port 0 single flow goes to single queue: Avg latency improved from 60us to 21us, frame loss improved from 0.5% to 0.0%. Case #3 IP forwarding from port 1 to port 0 Max Throughput IXIA sends 100%, 8192 UDP flows, goes to 24 queues: Enhanced is equal or slightly better than basic. Testing the basic compression feature with this patch shows there is no perfrormance degradation of the basic compression feature. Signed-off-by: Ofer Levi Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 1ff91cb79ded..eb3fac30488b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -882,6 +882,12 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe) return cqe->op_own >> 4; } +static inline u8 get_cqe_enhanced_num_mini_cqes(struct mlx5_cqe64 *cqe) +{ + /* num_of_mini_cqes is zero based */ + return get_cqe_opcode(cqe) + 1; +} + static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) { return (cqe->lro.tcppsh_abort_dupack >> 6) & 1; -- cgit From 4dc533e0f2c04174e1ae4aa98e7cffc1c04b9998 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 2 Nov 2022 16:49:17 +0800 Subject: kallsyms: Add helper kallsyms_on_each_match_symbol() Function kallsyms_on_each_symbol() traverses all symbols and submits each symbol to the hook 'fn' for judgment and processing. For some cases, the hook actually only handles the matched symbol, such as livepatch. Because all symbols are currently sorted by name, all the symbols with the same name are clustered together. Function kallsyms_lookup_names() gets the start and end positions of the set corresponding to the specified name. So we can easily and quickly traverse all the matches. The test results are as follows (twice): (x86) kallsyms_on_each_match_symbol: 7454, 7984 kallsyms_on_each_symbol : 11733809, 11785803 kallsyms_on_each_match_symbol() consumes only 0.066% of kallsyms_on_each_symbol()'s time. In other words, 1523x better performance. Signed-off-by: Zhen Lei Signed-off-by: Luis Chamberlain --- include/linux/kallsyms.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 649faac31ddb..0cd33be7142a 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -69,6 +69,8 @@ static inline void *dereference_symbol_descriptor(void *ptr) int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); +int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), + const char *name, void *data); /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); @@ -168,6 +170,12 @@ static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct { return -EOPNOTSUPP; } + +static inline int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), + const char *name, void *data) +{ + return -EOPNOTSUPP; +} #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) -- cgit From 92125c3a602454824d70edb1b2abb382811cab4f Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 10 Nov 2022 15:32:17 -0600 Subject: ibmvnic: Add hotpluggable CPU callbacks to reassign affinity hints When CPU's are added and removed, ibmvnic devices will reassign hint values. Introduce a new cpu hotplug state CPUHP_IBMVNIC_DEAD to signal to ibmvnic devices that the CPU has been removed and it is time to reset affinity hint assignments. On the other hand, when CPU's are being added, add a state instance to CPUHP_AP_ONLINE_DYN which will trigger a reassignment of affinity hints once the new CPU's are online. This implementation is based on the virtio_net driver. Signed-off-by: Thomas Falcon Signed-off-by: Dany Madden Signed-off-by: Nick Child Reviewed-by: Rick Lindsley Reviewed-by: Haren Myneni Signed-off-by: David S. Miller --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f61447913db9..c8bc85a87b1e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -69,6 +69,7 @@ enum cpuhp_state { CPUHP_X86_APB_DEAD, CPUHP_X86_MCE_DEAD, CPUHP_VIRT_NET_DEAD, + CPUHP_IBMVNIC_DEAD, CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, -- cgit From 42271ca389edb0446b9e492858b4c38083b0b9f8 Mon Sep 17 00:00:00 2001 From: Giulio Benetti Date: Wed, 19 Oct 2022 18:04:07 +0200 Subject: lib/raid6: drop RAID6_USE_EMPTY_ZERO_PAGE RAID6_USE_EMPTY_ZERO_PAGE is unused and hardcoded to 0, so let's drop it. Signed-off-by: Giulio Benetti Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- include/linux/raid/pq.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index d6e5a1feb947..f29aaaf2eb21 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -10,17 +10,9 @@ #ifdef __KERNEL__ -/* Set to 1 to use kernel-wide empty_zero_page */ -#define RAID6_USE_EMPTY_ZERO_PAGE 0 #include -/* We need a pre-zeroed page... if we don't want to use the kernel-provided - one define it here */ -#if RAID6_USE_EMPTY_ZERO_PAGE -# define raid6_empty_zero_page empty_zero_page -#else extern const char raid6_empty_zero_page[PAGE_SIZE]; -#endif #else /* ! __KERNEL__ */ /* Used for testing in user space */ -- cgit From 278294798ac9118412c9624a801d3f20f2279363 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 26 Sep 2022 14:57:10 -0700 Subject: PCI: Allow drivers to request exclusive config regions PCI config space access from user space has traditionally been unrestricted with writes being an understood risk for device operation. Unfortunately, device breakage or odd behavior from config writes lacks indicators that can leave driver writers confused when evaluating failures. This is especially true with the new PCIe Data Object Exchange (DOE) mailbox protocol where backdoor shenanigans from user space through things such as vendor defined protocols may affect device operation without complete breakage. A prior proposal restricted read and writes completely.[1] Greg and Bjorn pointed out that proposal is flawed for a couple of reasons. First, lspci should always be allowed and should not interfere with any device operation. Second, setpci is a valuable tool that is sometimes necessary and it should not be completely restricted.[2] Finally methods exist for full lock of device access if required. Even though access should not be restricted it would be nice for driver writers to be able to flag critical parts of the config space such that interference from user space can be detected. Introduce pci_request_config_region_exclusive() to mark exclusive config regions. Such regions trigger a warning and kernel taint if accessed via user space. Create pci_warn_once() to restrict the user from spamming the log. [1] https://lore.kernel.org/all/161663543465.1867664.5674061943008380442.stgit@dwillia2-desk3.amr.corp.intel.com/ [2] https://lore.kernel.org/all/YF8NGeGv9vYcMfTV@kroah.com/ Cc: Bjorn Helgaas Cc: Greg Kroah-Hartman Reviewed-by: Jonathan Cameron Suggested-by: Dan Williams Signed-off-by: Ira Weiny Acked-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20220926215711.2893286-2-ira.weiny@intel.com Signed-off-by: Dan Williams --- include/linux/ioport.h | 2 ++ include/linux/pci.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 27642ca15d93..4ae3c541ea6f 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -318,6 +318,8 @@ extern void __devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n); extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size); extern bool iomem_is_exclusive(u64 addr); +extern bool resource_is_exclusive(struct resource *resource, u64 addr, + resource_size_t size); extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, diff --git a/include/linux/pci.h b/include/linux/pci.h index 2bda4a4e47e8..575849a100a3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -409,6 +409,7 @@ struct pci_dev { */ unsigned int irq; struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ + struct resource driver_exclusive_resource; /* driver exclusive resource ranges */ bool match_driver; /* Skip attaching driver */ @@ -1407,6 +1408,21 @@ int pci_request_selected_regions(struct pci_dev *, int, const char *); int pci_request_selected_regions_exclusive(struct pci_dev *, int, const char *); void pci_release_selected_regions(struct pci_dev *, int); +static inline __must_check struct resource * +pci_request_config_region_exclusive(struct pci_dev *pdev, unsigned int offset, + unsigned int len, const char *name) +{ + return __request_region(&pdev->driver_exclusive_resource, offset, len, + name, IORESOURCE_EXCLUSIVE); +} + +static inline void pci_release_config_region(struct pci_dev *pdev, + unsigned int offset, + unsigned int len) +{ + __release_region(&pdev->driver_exclusive_resource, offset, len); +} + /* drivers/pci/bus.c */ void pci_add_resource(struct list_head *resources, struct resource *res); void pci_add_resource_offset(struct list_head *resources, struct resource *res, @@ -2484,6 +2500,7 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type); #define pci_crit(pdev, fmt, arg...) dev_crit(&(pdev)->dev, fmt, ##arg) #define pci_err(pdev, fmt, arg...) dev_err(&(pdev)->dev, fmt, ##arg) #define pci_warn(pdev, fmt, arg...) dev_warn(&(pdev)->dev, fmt, ##arg) +#define pci_warn_once(pdev, fmt, arg...) dev_warn_once(&(pdev)->dev, fmt, ##arg) #define pci_notice(pdev, fmt, arg...) dev_notice(&(pdev)->dev, fmt, ##arg) #define pci_info(pdev, fmt, arg...) dev_info(&(pdev)->dev, fmt, ##arg) #define pci_dbg(pdev, fmt, arg...) dev_dbg(&(pdev)->dev, fmt, ##arg) -- cgit From 1156b4418db01b1d5a332bc399817d029acd2ec8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 28 Oct 2022 11:34:04 -0700 Subject: memregion: Add cpu_cache_invalidate_memregion() interface With CXL security features, and CXL dynamic provisioning, global CPU cache flushing nvdimm requirements are no longer specific to that subsystem, even beyond the scope of security_ops. CXL will need such semantics for features not necessarily limited to persistent memory. The functionality this is enabling is to be able to instantaneously secure erase potentially terabytes of memory at once and the kernel needs to be sure that none of the data from before the erase is still present in the cache. It is also used when unlocking a memory device where speculative reads and firmware accesses could have cached poison from before the device was unlocked. Lastly this facility is used when mapping new devices, or new capacity into an established physical address range. I.e. when the driver switches DeviceA mapping AddressX to DeviceB mapping AddressX then any cached data from DeviceA:AddressX needs to be invalidated. This capability is typically only used once per-boot (for unlock), or once per bare metal provisioning event (secure erase), like when handing off the system to another tenant or decommissioning a device. It may also be used for dynamic CXL region provisioning. Users must first call cpu_cache_has_invalidate_memregion() to know whether this functionality is available on the architecture. On x86 this respects the constraints of when wbinvd() is tolerable. It is already the case that wbinvd() is problematic to allow in VMs due its global performance impact and KVM, for example, has been known to just trap and ignore the call. With confidential computing guest execution of wbinvd() may even trigger an exception. Given guests should not be messing with the bare metal address map via CXL configuration changes cpu_cache_has_invalidate_memregion() returns false in VMs. While this global cache invalidation facility, is exported to modules, since NVDIMM and CXL support can be built as a module, it is not for general use. The intent is that this facility is not available outside of specific "device-memory" use cases. To make that expectation as clear as possible the API is scoped to a new "DEVMEM" module namespace that only the NVDIMM and CXL subsystems are expected to import. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Peter Zijlstra Tested-by: Dave Jiang Signed-off-by: Davidlohr Bueso Acked-by: Dave Hansen Co-developed-by: Dan Williams Signed-off-by: Dan Williams --- include/linux/memregion.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memregion.h b/include/linux/memregion.h index c04c4fd2e209..bf83363807ac 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -3,6 +3,7 @@ #define _MEMREGION_H_ #include #include +#include struct memregion_info { int target_node; @@ -20,4 +21,41 @@ static inline void memregion_free(int id) { } #endif + +/** + * cpu_cache_invalidate_memregion - drop any CPU cached data for + * memregions described by @res_desc + * @res_desc: one of the IORES_DESC_* types + * + * Perform cache maintenance after a memory event / operation that + * changes the contents of physical memory in a cache-incoherent manner. + * For example, device memory technologies like NVDIMM and CXL have + * device secure erase, and dynamic region provision that can replace + * the memory mapped to a given physical address. + * + * Limit the functionality to architectures that have an efficient way + * to writeback and invalidate potentially terabytes of address space at + * once. Note that this routine may or may not write back any dirty + * contents while performing the invalidation. It is only exported for + * the explicit usage of the NVDIMM and CXL modules in the 'DEVMEM' + * symbol namespace on bare platforms. + * + * Returns 0 on success or negative error code on a failure to perform + * the cache maintenance. + */ +#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION +int cpu_cache_invalidate_memregion(int res_desc); +bool cpu_cache_has_invalidate_memregion(void); +#else +static inline bool cpu_cache_has_invalidate_memregion(void) +{ + return false; +} + +static inline int cpu_cache_invalidate_memregion(int res_desc) +{ + WARN_ON_ONCE("CPU cache invalidation required"); + return -ENXIO; +} +#endif #endif /* _MEMREGION_H_ */ -- cgit From e81c782c16844dc758a784899c2fe5260386211b Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 10 Nov 2022 13:45:04 +0000 Subject: ACPI: Implement a generic FFH Opregion handler This registers the FFH OpRegion handler before ACPI tables are loaded. The platform support for the same is checked via Platform-Wide OSPM Capabilities(OSC) before registering the OpRegion handler. It relies on the special context data passed to offset and the length. However the interpretation of the values is platform/architecture specific. This generic handler just passed all the information to the platform/architecture specific callback. It also implements the default callbacks which return as not supported. Signed-off-by: Sudeep Holla Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3015235d65e3..c026c1129cba 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -586,6 +586,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE 0x00004000 #define OSC_SB_NATIVE_USB4_SUPPORT 0x00040000 #define OSC_SB_PRM_SUPPORT 0x00200000 +#define OSC_SB_FFH_OPR_SUPPORT 0x00400000 extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; @@ -1488,6 +1489,12 @@ void acpi_init_pcc(void); static inline void acpi_init_pcc(void) { } #endif +#ifdef CONFIG_ACPI_FFH +void acpi_init_ffh(void); +#else +static inline void acpi_init_ffh(void) { } +#endif + #ifdef CONFIG_ACPI extern void acpi_device_notify(struct device *dev); extern void acpi_device_notify_remove(struct device *dev); -- cgit From 4e016f969529f2aec0545e90119e7eb3cb124c46 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:18 +0200 Subject: vfio: Add an option to get migration data size Add an option to get migration data size by introducing a new migration feature named VFIO_DEVICE_FEATURE_MIG_DATA_SIZE. Upon VFIO_DEVICE_FEATURE_GET the estimated data length that will be required to complete STOP_COPY is returned. This option may better enable user space to consider before moving to STOP_COPY whether it can meet the downtime SLA based on the returned data. The patch also includes the implementation for mlx5 and hisi for this new option to make it feature complete for the existing drivers in this area. Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Reviewed-by: Longfang Liu Link: https://lore.kernel.org/r/20221106174630.25909-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7480154825e..43b67e46a2cb 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -107,6 +107,9 @@ struct vfio_device_ops { * @migration_get_state: Optional callback to get the migration state for * devices that support migration. It's mandatory for * VFIO_DEVICE_FEATURE_MIGRATION migration support. + * @migration_get_data_size: Optional callback to get the estimated data + * length that will be required to complete stop copy. It's mandatory for + * VFIO_DEVICE_FEATURE_MIGRATION migration support. */ struct vfio_migration_ops { struct file *(*migration_set_state)( @@ -114,6 +117,8 @@ struct vfio_migration_ops { enum vfio_device_mig_state new_state); int (*migration_get_state)(struct vfio_device *device, enum vfio_device_mig_state *curr_state); + int (*migration_get_data_size)(struct vfio_device *device, + unsigned long *stop_copy_length); }; /** -- cgit From 2d577252579b3efb9e934b68948a2edfa9920110 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:23 +0530 Subject: bpf: Remove BPF_MAP_OFF_ARR_MAX In f71b2f64177a ("bpf: Refactor map->off_arr handling"), map->off_arr was refactored to be btf_field_offs. The number of field offsets is equal to maximum possible fields limited by BTF_FIELDS_MAX. Hence, reuse BTF_FIELDS_MAX as spin_lock and timer no longer are to be handled specially for offset sorting, fix the comment, and remove incorrect WARN_ON as its rec->cnt can never exceed this value. The reason to keep separate constant was the it was always more 2 more than total kptrs. This is no longer the case. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 798aec816970..1a66a1df1af1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -165,9 +165,8 @@ struct bpf_map_ops { }; enum { - /* Support at most 8 pointers in a BTF type */ - BTF_FIELDS_MAX = 10, - BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX, + /* Support at most 10 fields in a BTF type */ + BTF_FIELDS_MAX = 10, }; enum btf_field_type { @@ -203,8 +202,8 @@ struct btf_record { struct btf_field_offs { u32 cnt; - u32 field_off[BPF_MAP_OFF_ARR_MAX]; - u8 field_sz[BPF_MAP_OFF_ARR_MAX]; + u32 field_off[BTF_FIELDS_MAX]; + u8 field_sz[BTF_FIELDS_MAX]; }; struct bpf_map { -- cgit From e5feed0f64f73e167ef70755d3dc2db959d8fd5c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:24 +0530 Subject: bpf: Fix copy_map_value, zero_map_value The current offset needs to also skip over the already copied region in addition to the size of the next field. This case manifests where there are gaps between adjacent special fields. It was observed that for a map value with size 48, having fields at: off: 0, 16, 32 size: 4, 16, 16 The current code does: memcpy(dst + 0, src + 0, 0) memcpy(dst + 4, src + 4, 12) memcpy(dst + 20, src + 20, 12) memcpy(dst + 36, src + 36, 12) With the fix, it is done correctly as: memcpy(dst + 0, src + 0, 0) memcpy(dst + 4, src + 4, 12) memcpy(dst + 32, src + 32, 0) memcpy(dst + 48, src + 48, 0) Fixes: 4d7d7f69f4b1 ("bpf: Adapt copy_map_value for multiple offset case") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1a66a1df1af1..f08eb2d27de0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -360,7 +360,7 @@ static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += foffs->field_sz[i]; + curr_off += foffs->field_sz[i] + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } @@ -390,7 +390,7 @@ static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += foffs->field_sz[i]; + curr_off += foffs->field_sz[i] + sz; } memset(dst + curr_off, 0, size - curr_off); } -- cgit From f0c5941ff5b255413d31425bb327c2aec3625673 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:25 +0530 Subject: bpf: Support bpf_list_head in map values Add the support on the map side to parse, recognize, verify, and build metadata table for a new special field of the type struct bpf_list_head. To parameterize the bpf_list_head for a certain value type and the list_node member it will accept in that value type, we use BTF declaration tags. The definition of bpf_list_head in a map value will be done as follows: struct foo { struct bpf_list_node node; int data; }; struct map_value { struct bpf_list_head head __contains(foo, node); }; Then, the bpf_list_head only allows adding to the list 'head' using the bpf_list_node 'node' for the type struct foo. The 'contains' annotation is a BTF declaration tag composed of four parts, "contains:name:node" where the name is then used to look up the type in the map BTF, with its kind hardcoded to BTF_KIND_STRUCT during the lookup. The node defines name of the member in this type that has the type struct bpf_list_node, which is actually used for linking into the linked list. For now, 'kind' part is hardcoded as struct. This allows building intrusive linked lists in BPF, using container_of to obtain pointer to entry, while being completely type safe from the perspective of the verifier. The verifier knows exactly the type of the nodes, and knows that list helpers return that type at some fixed offset where the bpf_list_node member used for this list exists. The verifier also uses this information to disallow adding types that are not accepted by a certain list. For now, no elements can be added to such lists. Support for that is coming in future patches, hence draining and freeing items is done with a TODO that will be resolved in a future patch. Note that the bpf_list_head_free function moves the list out to a local variable under the lock and releases it, doing the actual draining of the list items outside the lock. While this helps with not holding the lock for too long pessimizing other concurrent list operations, it is also necessary for deadlock prevention: unless every function called in the critical section would be notrace, a fentry/fexit program could attach and call bpf_map_update_elem again on the map, leading to the same lock being acquired if the key matches and lead to a deadlock. While this requires some special effort on part of the BPF programmer to trigger and is highly unlikely to occur in practice, it is always better if we can avoid such a condition. While notrace would prevent this, doing the draining outside the lock has advantages of its own, hence it is used to also fix the deadlock related problem. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f08eb2d27de0..05f98e9e5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -175,6 +175,7 @@ enum btf_field_type { BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, + BPF_LIST_HEAD = (1 << 4), }; struct btf_field_kptr { @@ -184,11 +185,18 @@ struct btf_field_kptr { u32 btf_id; }; +struct btf_field_list_head { + struct btf *btf; + u32 value_btf_id; + u32 node_offset; +}; + struct btf_field { u32 offset; enum btf_field_type type; union { struct btf_field_kptr kptr; + struct btf_field_list_head list_head; }; }; @@ -266,6 +274,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; + case BPF_LIST_HEAD: + return "bpf_list_head"; default: WARN_ON_ONCE(1); return "unknown"; @@ -282,6 +292,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); + case BPF_LIST_HEAD: + return sizeof(struct bpf_list_head); default: WARN_ON_ONCE(1); return 0; @@ -298,6 +310,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); + case BPF_LIST_HEAD: + return __alignof__(struct bpf_list_head); default: WARN_ON_ONCE(1); return 0; @@ -403,6 +417,9 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); +void bpf_list_head_free(const struct btf_field *field, void *list_head, + struct bpf_spin_lock *spin_lock); + int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; -- cgit From 2de2669b4e52b2ae2f118bfc310004f50b47f0f5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:26 +0530 Subject: bpf: Rename RET_PTR_TO_ALLOC_MEM Currently, the verifier has two return types, RET_PTR_TO_ALLOC_MEM, and RET_PTR_TO_ALLOC_MEM_OR_NULL, however the former is confusingly named to imply that it carries MEM_ALLOC, while only the latter does. This causes confusion during code review leading to conclusions like that the return value of RET_PTR_TO_DYNPTR_MEM_OR_NULL (which is RET_PTR_TO_ALLOC_MEM | PTR_MAYBE_NULL) may be consumable by bpf_ringbuf_{submit,commit}. Rename it to make it clear MEM_ALLOC needs to be tacked on top of RET_PTR_TO_MEM. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05f98e9e5c48..2fe3ec620d54 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -607,7 +607,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ - RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ + RET_PTR_TO_MEM, /* returns a pointer to memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, @@ -617,8 +617,8 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, - RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_MEM, + RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is -- cgit From 894f2a8b1673a355a1a7507a4dfa6a3c836d07c1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:27 +0530 Subject: bpf: Rename MEM_ALLOC to MEM_RINGBUF Currently, verifier uses MEM_ALLOC type tag to specially tag memory returned from bpf_ringbuf_reserve helper. However, this is currently only used for this purpose and there is an implicit assumption that it only refers to ringbuf memory (e.g. the check for ARG_PTR_TO_ALLOC_MEM in check_func_arg_reg_off). Hence, rename MEM_ALLOC to MEM_RINGBUF to indicate this special relationship and instead open the use of MEM_ALLOC for more generic allocations made for user types. Also, since ARG_PTR_TO_ALLOC_MEM_OR_NULL is unused, simply drop it. Finally, update selftests using 'alloc_' verifier string to 'ringbuf_'. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2fe3ec620d54..afc1c51b59ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,10 +488,8 @@ enum bpf_type_flag { */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), - /* MEM was "allocated" from a different helper, and cannot be mixed - * with regular non-MEM_ALLOC'ed MEM types. - */ - MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + /* MEM points to BPF ring buffer reservation. */ + MEM_RINGBUF = BIT(2 + BPF_BASE_TYPE_BITS), /* MEM is in user address space. */ MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), @@ -565,7 +563,7 @@ enum bpf_arg_type { ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ - ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ + ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ @@ -582,7 +580,6 @@ enum bpf_arg_type { ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, - ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* pointer to memory does not need to be initialized, helper function must fill @@ -617,7 +614,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_MEM, + RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, -- cgit From 6728aea7216c0c06c98e2e58d753a5e8b2ae1c6f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:28 +0530 Subject: bpf: Refactor btf_struct_access Instead of having to pass multiple arguments that describe the register, pass the bpf_reg_state into the btf_struct_access callback. Currently, all call sites simply reuse the btf and btf_id of the reg they want to check the access of. The only exception to this pattern is the callsite in check_ptr_to_map_access, hence for that case create a dummy reg to simulate PTR_TO_BTF_ID access. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 ++++++++--------- include/linux/filter.h | 8 ++++---- 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index afc1c51b59ff..49f9d2bec401 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -771,6 +771,7 @@ struct bpf_prog_ops { union bpf_attr __user *uattr); }; +struct bpf_reg_state; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto * @@ -792,9 +793,8 @@ struct bpf_verifier_ops { struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); }; @@ -2080,9 +2080,9 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, return btf_ctx_access(off, size, type, prog, info); } -int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, +int btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, @@ -2333,9 +2333,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) } static inline int btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag) { return -EACCES; diff --git a/include/linux/filter.h b/include/linux/filter.h index efc42a6e3aed..787d35dbf5b0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -568,10 +568,10 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; -extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, -- cgit From 30f3bb09778de64ef9f23fb4bb5f868c4728a071 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 15 Nov 2022 16:33:48 +0800 Subject: kallsyms: Add self-test facility Added test cases for basic functions and performance of functions kallsyms_lookup_name(), kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol(). It also calculates the compression rate of the kallsyms compression algorithm for the current symbol set. The basic functions test begins by testing a set of symbols whose address values are known. Then, traverse all symbol addresses and find the corresponding symbol name based on the address. It's impossible to determine whether these addresses are correct, but we can use the above three functions along with the addresses to test each other. Due to the traversal operation of kallsyms_on_each_symbol() is too slow, only 60 symbols can be tested in one second, so let it test on average once every 128 symbols. The other two functions validate all symbols. If the basic functions test is passed, print only performance test results. If the test fails, print error information, but do not perform subsequent performance tests. Start self-test automatically after system startup if CONFIG_KALLSYMS_SELFTEST=y. Example of output content: (prefix 'kallsyms_selftest:' is omitted start --------------------------------------------------------- | nr_symbols | compressed size | original size | ratio(%) | |---------------------------------------------------------| | 107543 | 1357912 | 2407433 | 56.40 | --------------------------------------------------------- kallsyms_lookup_name() looked up 107543 symbols The time spent on each symbol is (ns): min=630, max=35295, avg=7353 kallsyms_on_each_symbol() traverse all: 11782628 ns kallsyms_on_each_match_symbol() traverse all: 9261 ns finish Signed-off-by: Zhen Lei Signed-off-by: Luis Chamberlain --- include/linux/kallsyms.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 0cd33be7142a..0065209cc004 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -66,6 +66,7 @@ static inline void *dereference_symbol_descriptor(void *ptr) } #ifdef CONFIG_KALLSYMS +unsigned long kallsyms_sym_address(int idx); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); -- cgit From 855b7717f44b13e0990aa5ad36bbf9aa35051516 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Mon, 31 Oct 2022 21:53:50 +0530 Subject: nvme: fine-granular CAP_SYS_ADMIN for nvme io commands Currently both io and admin commands are kept under a coarse-granular CAP_SYS_ADMIN check, disregarding file mode completely. $ ls -l /dev/ng* crw-rw-rw- 1 root root 242, 0 Sep 9 19:20 /dev/ng0n1 crw------- 1 root root 242, 1 Sep 9 19:20 /dev/ng0n2 In the example above, ng0n1 appears as if it may allow unprivileged read/write operation but it does not and behaves same as ng0n2. This patch implements a shift from CAP_SYS_ADMIN to more fine-granular control for io-commands. If CAP_SYS_ADMIN is present, nothing else is checked as before. Otherwise, following rules are in place - any admin-cmd is not allowed - vendor-specific and fabric commmand are not allowed - io-commands that can write are allowed if matching FMODE_WRITE permission is present - io-commands that read are allowed Add a helper nvme_cmd_allowed that implements above policy. Change all the callers of CAP_SYS_ADMIN to go through nvme_cmd_allowed for any decision making. Since file open mode is counted for any approval/denial, change at various places to keep file-mode information handy. Signed-off-by: Kanchan Joshi Reviewed-by: Jens Axboe Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 050d7d0cd81b..1d102b662e88 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -797,6 +797,7 @@ enum nvme_opcode { nvme_cmd_zone_mgmt_send = 0x79, nvme_cmd_zone_mgmt_recv = 0x7a, nvme_cmd_zone_append = 0x7d, + nvme_cmd_vendor_start = 0x80, }; #define nvme_opcode_name(opcode) { opcode, #opcode } -- cgit From 1b96f862ecccb3e6f950eba584bebf22955cecc5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 16:50:15 +0100 Subject: nvme: implement the DEAC bit for the Write Zeroes command While the specification allows devices to either deallocate data or to actually write zeroes on any Write Zeroes command, many SSDs only do the sensible thing and deallocate data when the DEAC bit is specific. Set it when it is supported and the caller doesn't explicitly opt out of deallocation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1d102b662e88..d6be2a686100 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -964,6 +964,7 @@ enum { NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, NVME_RW_PRINFO_PRACT = 1 << 13, NVME_RW_DTYPE_STREAMS = 1 << 4, + NVME_WZ_DEAC = 1 << 9, }; struct nvme_dsm_cmd { -- cgit From e7f9ff5dc90c3826231343439c35c6b7e9e57378 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 11 Nov 2022 14:19:08 -0800 Subject: gpiolib: add support for software nodes Now that static device properties understand notion of child nodes and references, let's teach gpiolib to handle them: - GPIOs are represented as a references to software nodes representing gpiochip - references must have 2 arguments - GPIO number within the chip and GPIO flags (GPIO_ACTIVE_LOW/GPIO_ACTIVE_HIGH, etc) - a new PROPERTY_ENTRY_GPIO() macro is supplied to ensure the above - name of the software node representing gpiochip must match label of the gpiochip, as we use it to locate gpiochip structure at runtime The following illustrates use of software nodes to describe a "System" button that is currently specified via use of gpio_keys_platform_data in arch/mips/alchemy/board-mtx1.c. It follows bindings specified in Documentation/devicetree/bindings/input/gpio-keys.yaml. static const struct software_node mxt1_gpiochip2_node = { .name = "alchemy-gpio2", }; static const struct property_entry mtx1_gpio_button_props[] = { PROPERTY_ENTRY_U32("linux,code", BTN_0), PROPERTY_ENTRY_STRING("label", "System button"), PROPERTY_ENTRY_GPIO("gpios", &mxt1_gpiochip2_node, 7, GPIO_ACTIVE_LOW), { } }; Similarly, arch/arm/mach-tegra/board-paz00.c can be converted to: static const struct software_node tegra_gpiochip_node = { .name = "tegra-gpio", }; static struct property_entry wifi_rfkill_prop[] __initdata = { PROPERTY_ENTRY_STRING("name", "wifi_rfkill"), PROPERTY_ENTRY_STRING("type", "wlan"), PROPERTY_ENTRY_GPIO("reset-gpios", &tegra_gpiochip_node, 25, GPIO_ACTIVE_HIGH); PROPERTY_ENTRY_GPIO("shutdown-gpios", &tegra_gpiochip_node, 85, GPIO_ACTIVE_HIGH); { }, }; static struct platform_device wifi_rfkill_device = { .name = "rfkill_gpio", .id = -1, }; ... software_node_register(&tegra_gpiochip_node); device_create_managed_software_node(&wifi_rfkill_device.dev, wifi_rfkill_prop, NULL); Acked-by: Linus Walleij Reviewed-by: Andy Shevchenko Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/property.h | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 include/linux/gpio/property.h (limited to 'include/linux') diff --git a/include/linux/gpio/property.h b/include/linux/gpio/property.h new file mode 100644 index 000000000000..6c75c8bd44a0 --- /dev/null +++ b/include/linux/gpio/property.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ +#ifndef __LINUX_GPIO_PROPERTY_H +#define __LINUX_GPIO_PROPERTY_H + +#include /* for GPIO_* flags */ +#include + +#define PROPERTY_ENTRY_GPIO(_name_, _chip_node_, _idx_, _flags_) \ + PROPERTY_ENTRY_REF(_name_, _chip_node_, _idx_, _flags_) + +#endif /* __LINUX_GPIO_PROPERTY_H */ -- cgit From 14d898f3c1b3bf9c4375ee3255ec9e9b89a35578 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 8 Nov 2022 15:05:59 +0100 Subject: dev: Move received_rps counter next to RPS members in softnet data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the received_rps counter value next to the other RPS-related members in softnet_data. This closes two four-byte holes in the structure, making room for another pointer in the first two cache lines without bumping the xmit struct to its own line. Acked-by: Song Liu Reviewed-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221108140601.149971-2-toke@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02a2318da7c7..ddc59ef98500 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3128,7 +3128,6 @@ struct softnet_data { /* stats */ unsigned int processed; unsigned int time_squeeze; - unsigned int received_rps; #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; #endif @@ -3161,6 +3160,7 @@ struct softnet_data { unsigned int cpu; unsigned int input_queue_tail; #endif + unsigned int received_rps; unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; -- cgit From 32637e33003f36e75e9147788cc0e2f21706ef99 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 8 Nov 2022 15:06:00 +0100 Subject: bpf: Expand map key argument of bpf_redirect_map to u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For queueing packets in XDP we want to add a new redirect map type with support for 64-bit indexes. To prepare fore this, expand the width of the 'key' argument to the bpf_redirect_map() helper. Since BPF registers are always 64-bit, this should be safe to do after the fact. Acked-by: Song Liu Reviewed-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221108140601.149971-3-toke@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/filter.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 49f9d2bec401..54462dd28824 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -135,7 +135,7 @@ struct bpf_map_ops { struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ - int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags); + int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure diff --git a/include/linux/filter.h b/include/linux/filter.h index 787d35dbf5b0..bf701976056e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -643,13 +643,13 @@ struct bpf_nh_params { }; struct bpf_redirect_info { - u32 flags; - u32 tgt_index; + u64 tgt_index; void *tgt_value; struct bpf_map *map; + u32 flags; + u32 kern_flags; u32 map_id; enum bpf_map_type map_type; - u32 kern_flags; struct bpf_nh_params nh; }; @@ -1504,7 +1504,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { @@ -1515,7 +1515,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; - ri->tgt_value = lookup_elem(map, ifindex); + ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program @@ -1527,7 +1527,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind return flags & action_mask; } - ri->tgt_index = ifindex; + ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; -- cgit From 597d77d29c5cc50b52bbbbd48c49a0237481a9df Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 15 Oct 2022 00:22:35 +0300 Subject: regset: make user_regset_copyin_ignore() *void* user_regset_copyin_ignore() apparently cannot fail and so always returns 0. Let's make this function return *void* instead of *int*... Link: https://lkml.kernel.org/r/20221014212235.10770-14-s.shtylyov@omp.ru Signed-off-by: Sergey Shtylyov Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Helge Deller Cc: James Bottomley Cc: Jonas Bonn Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Thomas Bogendoerfer Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/regset.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regset.h b/include/linux/regset.h index a00765f0e8cf..9061266dd8de 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -275,15 +275,15 @@ static inline int user_regset_copyin(unsigned int *pos, unsigned int *count, return 0; } -static inline int user_regset_copyin_ignore(unsigned int *pos, - unsigned int *count, - const void **kbuf, - const void __user **ubuf, - const int start_pos, - const int end_pos) +static inline void user_regset_copyin_ignore(unsigned int *pos, + unsigned int *count, + const void **kbuf, + const void __user **ubuf, + const int start_pos, + const int end_pos) { if (*count == 0) - return 0; + return; BUG_ON(*pos < start_pos); if (end_pos < 0 || *pos < end_pos) { unsigned int copy = (end_pos < 0 ? *count @@ -295,7 +295,6 @@ static inline int user_regset_copyin_ignore(unsigned int *pos, *pos += copy; *count -= copy; } - return 0; } extern int regset_get(struct task_struct *target, -- cgit From 06463f6e98df34908c26aa8e7a31a279646b1f51 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 9 Nov 2022 14:42:49 -0800 Subject: wifi: wl1251: drop support for platform data Remove support for configuring the device via platform data because there are no users of wl1251_platform_data left in the mainline kernel. Signed-off-by: Dmitry Torokhov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221109224250.2885119-2-dmitry.torokhov@gmail.com --- include/linux/wl12xx.h | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 include/linux/wl12xx.h (limited to 'include/linux') diff --git a/include/linux/wl12xx.h b/include/linux/wl12xx.h deleted file mode 100644 index 03d61f1d23ab..000000000000 --- a/include/linux/wl12xx.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * This file is part of wl12xx - * - * Copyright (C) 2009 Nokia Corporation - * - * Contact: Luciano Coelho - */ - -#ifndef _LINUX_WL12XX_H -#define _LINUX_WL12XX_H - -#include - -struct wl1251_platform_data { - int power_gpio; - /* SDIO only: IRQ number if WLAN_IRQ line is used, 0 for SDIO IRQs */ - int irq; - bool use_eeprom; -}; - -#ifdef CONFIG_WILINK_PLATFORM_DATA - -int wl1251_set_platform_data(const struct wl1251_platform_data *data); - -struct wl1251_platform_data *wl1251_get_platform_data(void); - -#else - -static inline -int wl1251_set_platform_data(const struct wl1251_platform_data *data) -{ - return -ENOSYS; -} - -static inline -struct wl1251_platform_data *wl1251_get_platform_data(void) -{ - return ERR_PTR(-ENODATA); -} - -#endif - -#endif -- cgit From 9804985bf27f8fbcf0d96c7435b5ad94a2a6ea20 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:57 -0800 Subject: udp: Introduce optional per-netns hash table. The maximum hash table size is 64K due to the nature of the protocol. [0] It's smaller than TCP, and fewer sockets can cause a performance drop. On an EC2 c5.24xlarge instance (192 GiB memory), after running iperf3 in different netns, creating 32Mi sockets without data transfer in the root netns causes regression for the iperf3's connection. uhash_entries sockets length Gbps 64K 1 1 5.69 1Mi 16 5.27 2Mi 32 4.90 4Mi 64 4.09 8Mi 128 2.96 16Mi 256 2.06 32Mi 512 1.12 The per-netns hash table breaks the lengthy lists into shorter ones. It is useful on a multi-tenant system with thousands of netns. With smaller hash tables, we can look up sockets faster, isolate noisy neighbours, and reduce lock contention. The max size of the per-netns table is 64K as well. This is because the possible hash range by udp_hashfn() always fits in 64K within the same netns and we cannot make full use of the whole buckets larger than 64K. /* 0 < num < 64K -> X < hash < X + 64K */ (num + net_hash_mix(net)) & mask; Also, the min size is 128. We use a bitmap to search for an available port in udp_lib_get_port(). To keep the bitmap on the stack and not fire the CONFIG_FRAME_WARN error at build time, we round up the table size to 128. The sysctl usage is the same with TCP: $ dmesg | cut -d ' ' -f 6- | grep "UDP hash" UDP hash table entries: 65536 (order: 9, 2097152 bytes, vmalloc) # sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = 65536 # can be changed by uhash_entries # sysctl net.ipv4.udp_child_hash_entries net.ipv4.udp_child_hash_entries = 0 # disabled by default # ip netns add test1 # ip netns exec test1 sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = -65536 # share the global table # sysctl -w net.ipv4.udp_child_hash_entries=100 net.ipv4.udp_child_hash_entries = 100 # ip netns add test2 # ip netns exec test2 sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = 128 # own a per-netns table with 2^n buckets We could optimise the hash table lookup/iteration further by removing the netns comparison for the per-netns one in the future. Also, we could optimise the sparse udp_hslot layout by putting it in udp_table. [0]: https://lore.kernel.org/netdev/4ACC2815.7010101@gmail.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/udp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index dea57aa37df6..a2892e151644 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb) return (struct udphdr *)skb_transport_header(skb); } +#define UDP_HTABLE_SIZE_MIN_PERNET 128 #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) +#define UDP_HTABLE_SIZE_MAX 65536 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) { -- cgit From 6c1c5097781f563b70a81683ea6fdac21637573b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 08:53:55 +0000 Subject: net: add atomic_long_t to net_device_stats fields Long standing KCSAN issues are caused by data-race around some dev->stats changes. Most performance critical paths already use per-cpu variables, or per-queue ones. It is reasonable (and more correct) to use atomic operations for the slow paths. This patch adds an union for each field of net_device_stats, so that we can convert paths that are not yet protected by a spinlock or a mutex. netdev_stats_to_stats64() no longer has an #if BITS_PER_LONG==64 Note that the memcpy() we were using on 64bit arches had no provision to avoid load-tearing, while atomic_long_read() is providing the needed protection at no cost. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 58 ++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02a2318da7c7..23b3903b0678 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -171,31 +171,38 @@ static inline bool dev_xmit_complete(int rc) * (unsigned long) so they can be read and written atomically. */ +#define NET_DEV_STAT(FIELD) \ + union { \ + unsigned long FIELD; \ + atomic_long_t __##FIELD; \ + } + struct net_device_stats { - unsigned long rx_packets; - unsigned long tx_packets; - unsigned long rx_bytes; - unsigned long tx_bytes; - unsigned long rx_errors; - unsigned long tx_errors; - unsigned long rx_dropped; - unsigned long tx_dropped; - unsigned long multicast; - unsigned long collisions; - unsigned long rx_length_errors; - unsigned long rx_over_errors; - unsigned long rx_crc_errors; - unsigned long rx_frame_errors; - unsigned long rx_fifo_errors; - unsigned long rx_missed_errors; - unsigned long tx_aborted_errors; - unsigned long tx_carrier_errors; - unsigned long tx_fifo_errors; - unsigned long tx_heartbeat_errors; - unsigned long tx_window_errors; - unsigned long rx_compressed; - unsigned long tx_compressed; + NET_DEV_STAT(rx_packets); + NET_DEV_STAT(tx_packets); + NET_DEV_STAT(rx_bytes); + NET_DEV_STAT(tx_bytes); + NET_DEV_STAT(rx_errors); + NET_DEV_STAT(tx_errors); + NET_DEV_STAT(rx_dropped); + NET_DEV_STAT(tx_dropped); + NET_DEV_STAT(multicast); + NET_DEV_STAT(collisions); + NET_DEV_STAT(rx_length_errors); + NET_DEV_STAT(rx_over_errors); + NET_DEV_STAT(rx_crc_errors); + NET_DEV_STAT(rx_frame_errors); + NET_DEV_STAT(rx_fifo_errors); + NET_DEV_STAT(rx_missed_errors); + NET_DEV_STAT(tx_aborted_errors); + NET_DEV_STAT(tx_carrier_errors); + NET_DEV_STAT(tx_fifo_errors); + NET_DEV_STAT(tx_heartbeat_errors); + NET_DEV_STAT(tx_window_errors); + NET_DEV_STAT(rx_compressed); + NET_DEV_STAT(tx_compressed); }; +#undef NET_DEV_STAT /* per-cpu stats, allocated on demand. * Try to fit them in a single cache line, for dev_get_stats() sake. @@ -5171,4 +5178,9 @@ extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; +/* Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters. */ +#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD) +#define DEV_STATS_ADD(DEV, FIELD, VAL) \ + atomic_long_add((VAL), &(DEV)->stats.__##FIELD) + #endif /* _LINUX_NETDEVICE_H */ -- cgit From 6e4068a11413b96687a03c39814539e202de294b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Nov 2022 15:18:19 +0000 Subject: mempool: introduce mempool_is_saturated Introduce a helper mempool_is_saturated(), which tells if the mempool is under-filled or not. We need it to figure out whether it should be freed right into the mempool or could be cached with top level caches. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/636aed30be8c35d78f45e244998bc6209283cccc.1667384020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/mempool.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 0c964ac107c2..4aae6c06c5f2 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -30,6 +30,11 @@ static inline bool mempool_initialized(mempool_t *pool) return pool->elements != NULL; } +static inline bool mempool_is_saturated(mempool_t *pool) +{ + return READ_ONCE(pool->curr_nr) >= pool->min_nr; +} + void mempool_exit(mempool_t *pool); int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, -- cgit From ee7dc86b6d3e3b86c2c487f713eda657850de238 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 15 Nov 2022 17:45:52 -0500 Subject: wait: Return number of exclusive waiters awaken Sbitmap code will need to know how many waiters were actually woken for its batched wakeups implementation. Return the number of woken exclusive waiters from __wake_up() to facilitate that. Suggested-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221115224553.23594-3-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/wait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 7f5a51aae0a7..a0307b516b09 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -209,7 +209,7 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq list_del(&wq_entry->entry); } -void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); +int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, unsigned int mode, void *key, wait_queue_entry_t *bookmark); -- cgit From 7abc077788363ac7194aefd355306f8e974feff7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 22:10:51 +0800 Subject: block: remove delayed holder registration Now that dm has been fixed to track of holder registrations before add_disk, the somewhat buggy block layer code can be safely removed. Signed-off-by: Christoph Hellwig Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20221115141054.1051801-8-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9188aa3f6259..516e45246868 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -833,7 +833,6 @@ void set_capacity(struct gendisk *disk, sector_t size); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); -int bd_register_pending_holders(struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) @@ -844,10 +843,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } -static inline int bd_register_pending_holders(struct gendisk *disk) -{ - return 0; -} #endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); -- cgit From 5e7a0af793ce7ed528117145f856224deb81b7a7 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Wed, 16 Nov 2022 11:08:19 -0800 Subject: fixp-arith: do not require users to include bug.h The fixp_sin32_rad() contains a call to BUG_ON(). If users of fixp-arith.h have not included the bug.h prior including the fixp-arith.h the compiler will not find the BUG_ON. Thus every user of fixp-arith.h must also include bug.h (or roll own variant of BUG_ON()). Include bug.h from fixp-arith.h so every user of fixp-arith.h does not need to include the bug.h prior inclusion of fixp-arith.h Fixes: 559addc25b00 ("[media] fixp-arith: replace sin/cos table by a better precision one") Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/Y3SgVdVey9legtX+@dc75zzyyyyyyyyyyyyydt-3.rev.dnainternet.fi Signed-off-by: Dmitry Torokhov --- include/linux/fixp-arith.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fixp-arith.h b/include/linux/fixp-arith.h index 281cb4f83dbe..e485fb0c1201 100644 --- a/include/linux/fixp-arith.h +++ b/include/linux/fixp-arith.h @@ -2,6 +2,7 @@ #ifndef _FIXP_ARITH_H #define _FIXP_ARITH_H +#include #include /* -- cgit From dae590a6c96c799434e0ff8156ef29b88c257e60 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 4 Nov 2022 20:59:02 -0400 Subject: blk-cgroup: Flush stats at blkgs destruction path As noted by Michal, the blkg_iostat_set's in the lockless list hold reference to blkg's to protect against their removal. Those blkg's hold reference to blkcg. When a cgroup is being destroyed, cgroup_rstat_flush() is only called at css_release_work_fn() which is called when the blkcg reference count reaches 0. This circular dependency will prevent blkcg from being freed until some other events cause cgroup_rstat_flush() to be called to flush out the pending blkcg stats. To prevent this delayed blkcg removal, add a new cgroup_rstat_css_flush() function to flush stats for a given css and cpu and call it at the blkgs destruction path, blkcg_destroy_blkgs(), whenever there are still some pending stats to be flushed. This will ensure that blkcg reference count can reach 0 ASAP. Signed-off-by: Waiman Long Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221105005902.407297-4-longman@redhat.com Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528bd44b59e2..6c4e66b3fa84 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -766,6 +766,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu); /* * Basic resource stats. -- cgit From 196270c5d6f3cfabd609e6202858fbccec14691f Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Tue, 15 Nov 2022 17:54:14 +0000 Subject: pinconf-generic: clarify pull up and pull down config values PIN_CONFIG_BIAS_PULL_DOWN and PIN_CONFIG_BIAS_PULL_UP values can be custom or an SI unit such as ohms Signed-off-by: Niyas Sait Link: https://lore.kernel.org/r/20221115175415.650690-3-niyas.sait@linaro.org Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 940fc4e9e17c..c9c10142657e 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -38,7 +38,8 @@ struct pinctrl_map; * impedance. * @PIN_CONFIG_BIAS_PULL_DOWN: the pin will be pulled down (usually with high * impedance to GROUND). If the argument is != 0 pull-down is enabled, - * if it is 0, pull-down is total, i.e. the pin is connected to GROUND. + * the value is interpreted by the driver and can be custom or an SI unit + * such as ohms. * @PIN_CONFIG_BIAS_PULL_PIN_DEFAULT: the pin will be pulled up or down based * on embedded knowledge of the controller hardware, like current mux * function. The pull direction and possibly strength too will normally @@ -49,7 +50,8 @@ struct pinctrl_map; * @PIN_CONFIG_BIAS_DISABLE. * @PIN_CONFIG_BIAS_PULL_UP: the pin will be pulled up (usually with high * impedance to VDD). If the argument is != 0 pull-up is enabled, - * if it is 0, pull-up is total, i.e. the pin is connected to VDD. + * the value is interpreted by the driver and can be custom or an SI unit + * such as ohms. * @PIN_CONFIG_DRIVE_OPEN_DRAIN: the pin will be driven with open drain (open * collector) which means it is usually wired with other output ports * which are then pulled up with an external resistor. Setting this -- cgit From 2f117484329b233455ee278f2d9b0a4356835060 Mon Sep 17 00:00:00 2001 From: Barnabás Pőcze Date: Mon, 14 Nov 2022 19:54:23 +0000 Subject: timerqueue: Use rb_entry_safe() in timerqueue_getnext() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `timerqueue_getnext()` is called on an empty timer queue, it will use `rb_entry()` on a NULL pointer, which is invalid. Fix that by using `rb_entry_safe()` which handles NULL pointers. This has not caused any issues so far because the offset of the `rb_node` member in `timerqueue_node` is 0, so `rb_entry()` is essentially a no-op. Fixes: 511885d7061e ("lib/timerqueue: Rely on rbtree semantics for next timer") Signed-off-by: Barnabás Pőcze Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20221114195421.342929-1-pobrn@protonmail.com --- include/linux/timerqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 93884086f392..adc80e29168e 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -35,7 +35,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) { struct rb_node *leftmost = rb_first_cached(&head->rb_root); - return rb_entry(leftmost, struct timerqueue_node, node); + return rb_entry_safe(leftmost, struct timerqueue_node, node); } static inline void timerqueue_init(struct timerqueue_node *node) -- cgit From 2f2940d168236a92df524a1bd99fc7b0325918b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Nov 2022 14:54:22 +0100 Subject: genirq/msi: Remove filter from msi_free_descs_free_range() When a range of descriptors is freed then all of them are not associated to a linux interrupt. Remove the filter and add a warning to the free function. Signed-off-by: Thomas Gleixner Reviewed-by: Ashok Raj Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221111122013.888850936@linutronix.de --- include/linux/msi.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index fc918a658d48..969ce46369b3 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -247,8 +247,7 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) #endif /* CONFIG_PCI_MSI */ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc); -void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, - unsigned int first_index, unsigned int last_index); +void msi_free_msi_descs_range(struct device *dev, unsigned int first_index, unsigned int last_index); /** * msi_free_msi_descs - Free MSI descriptors of a device @@ -256,7 +255,7 @@ void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, */ static inline void msi_free_msi_descs(struct device *dev) { - msi_free_msi_descs_range(dev, MSI_DESC_ALL, 0, MSI_MAX_INDEX); + msi_free_msi_descs_range(dev, 0, MSI_MAX_INDEX); } void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); -- cgit From 762687ceb31fc296e2e1406559e8bb50251c5277 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Nov 2022 14:54:25 +0100 Subject: genirq/msi: Make __msi_domain_alloc_irqs() static Nothing outside of the core code requires this. Signed-off-by: Thomas Gleixner Reviewed-by: Ashok Raj Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221111122014.004725919@linutronix.de --- include/linux/msi.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 969ce46369b3..9b552ee58a37 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -334,9 +334,8 @@ struct msi_domain_info; * MSI_FLAG_USE_DEF_DOM_OPS is not set to avoid breaking existing users and * because these callbacks are obviously mandatory. * - * This is NOT meant to be abused, but it can be useful to build wrappers - * for specialized MSI irq domains which need extra work before and after - * calling __msi_domain_alloc_irqs()/__msi_domain_free_irqs(). + * __msi_domain_free_irqs() is exposed for PPC pseries to handle extra + * work after all interrupts and descriptors have been freed. */ struct msi_domain_ops { irq_hw_number_t (*get_hwirq)(struct msi_domain_info *info, @@ -425,8 +424,6 @@ int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); -int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec); int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev, int nvec); int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, -- cgit From f6d3486a3d2f3c67d732641834eec872fcc66472 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Nov 2022 14:54:27 +0100 Subject: genirq/msi: Provide msi_domain_ops:: Post_free() To prepare for removing the exposure of __msi_domain_free_irqs() provide a post_free() callback in the MSI domain ops which can be used to solve the problem of the only user of __msi_domain_free_irqs() in arch/powerpc. Signed-off-by: Thomas Gleixner Reviewed-by: Ashok Raj Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221111122014.063153448@linutronix.de --- include/linux/msi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 9b552ee58a37..7f6aba9d6b4e 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -315,6 +315,8 @@ struct msi_domain_info; * function. * @domain_free_irqs: Optional function to override the default free * function. + * @msi_post_free: Optional function which is invoked after freeing + * all interrupts. * * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying * irqdomain. @@ -359,6 +361,8 @@ struct msi_domain_ops { struct device *dev, int nvec); void (*domain_free_irqs)(struct irq_domain *domain, struct device *dev); + void (*msi_post_free)(struct irq_domain *domain, + struct device *dev); }; /** -- cgit From 057c97a1cd665ebb38841418adcd35f57b33cc21 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Nov 2022 14:54:30 +0100 Subject: genirq/msi: Make __msi_domain_free_irqs() static Now that the last user is gone, confine it to the core code. Signed-off-by: Thomas Gleixner Reviewed-by: Ashok Raj Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221111122014.179595843@linutronix.de --- include/linux/msi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 7f6aba9d6b4e..ee735ffeabd4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -335,9 +335,6 @@ struct msi_domain_info; * are set to the default implementation if NULL and even when * MSI_FLAG_USE_DEF_DOM_OPS is not set to avoid breaking existing users and * because these callbacks are obviously mandatory. - * - * __msi_domain_free_irqs() is exposed for PPC pseries to handle extra - * work after all interrupts and descriptors have been freed. */ struct msi_domain_ops { irq_hw_number_t (*get_hwirq)(struct msi_domain_info *info, @@ -432,7 +429,6 @@ int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device int nvec); int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); -void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev); void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); -- cgit From aeef20527c87fed37c6f159d9eafd063a099f6ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Nov 2022 14:54:32 +0100 Subject: genirq/irqdomain: Move bus token enum into a seperate header Split the bus token defines out into a seperate header file to avoid inclusion of irqdomain.h in msi.h. Signed-off-by: Thomas Gleixner Reviewed-by: Ashok Raj Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221111122014.237221143@linutronix.de --- include/linux/irqdomain.h | 22 +--------------------- include/linux/irqdomain_defs.h | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 21 deletions(-) create mode 100644 include/linux/irqdomain_defs.h (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 00d577f90883..a4af7f81661b 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -31,6 +31,7 @@ #define _LINUX_IRQDOMAIN_H #include +#include #include #include #include @@ -68,27 +69,6 @@ struct irq_fwspec { void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, unsigned int count, struct irq_fwspec *fwspec); -/* - * Should several domains have the same device node, but serve - * different purposes (for example one domain is for PCI/MSI, and the - * other for wired IRQs), they can be distinguished using a - * bus-specific token. Most domains are expected to only carry - * DOMAIN_BUS_ANY. - */ -enum irq_domain_bus_token { - DOMAIN_BUS_ANY = 0, - DOMAIN_BUS_WIRED, - DOMAIN_BUS_GENERIC_MSI, - DOMAIN_BUS_PCI_MSI, - DOMAIN_BUS_PLATFORM_MSI, - DOMAIN_BUS_NEXUS, - DOMAIN_BUS_IPI, - DOMAIN_BUS_FSL_MC_MSI, - DOMAIN_BUS_TI_SCI_INTA_MSI, - DOMAIN_BUS_WAKEUP, - DOMAIN_BUS_VMD_MSI, -}; - /** * struct irq_domain_ops - Methods for irq_domain objects * @match: Match an interrupt controller device node to a host, returns diff --git a/include/linux/irqdomain_defs.h b/include/linux/irqdomain_defs.h new file mode 100644 index 000000000000..69035b4f740a --- /dev/null +++ b/include/linux/irqdomain_defs.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IRQDOMAIN_DEFS_H +#define _LINUX_IRQDOMAIN_DEFS_H + +/* + * Should several domains have the same device node, but serve + * different purposes (for example one domain is for PCI/MSI, and the + * other for wired IRQs), they can be distinguished using a + * bus-specific token. Most domains are expected to only carry + * DOMAIN_BUS_ANY. + */ +enum irq_domain_bus_token { + DOMAIN_BUS_ANY = 0, + DOMAIN_BUS_WIRED, + DOMAIN_BUS_GENERIC_MSI, + DOMAIN_BUS_PCI_MSI, + DOMAIN_BUS_PLATFORM_MSI, + DOMAIN_BUS_NEXUS, + DOMAIN_BUS_IPI, + DOMAIN_BUS_FSL_MC_MSI, + DOMAIN_BUS_TI_SCI_INTA_MSI, + DOMAIN_BUS_WAKEUP, + DOMAIN_BUS_VMD_MSI, +}; + +#endif /* _LINUX_IRQDOMAIN_DEFS_H */ -- cgit From 22db089a4437a72277677f99717af499560b13f2 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Fri, 11 Nov 2022 14:54:33 +0100 Subject: genirq/msi: Add bus token to struct msi_domain_info Add a bus token member to struct msi_domain_info and let msi_create_irq_domain() set the bus token. That allows to remove the bus token updates at the call sites. Suggested-by: Thomas Gleixner Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221111122014.294554462@linutronix.de --- include/linux/msi.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index ee735ffeabd4..2dfd7b2bdfbe 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -16,6 +16,7 @@ * abuse. The only function which is relevant for drivers is msi_get_virq(). */ +#include #include #include #include @@ -365,6 +366,7 @@ struct msi_domain_ops { /** * struct msi_domain_info - MSI interrupt domain data * @flags: Flags to decribe features and capabilities + * @bus_token: The domain bus token * @ops: The callback data structure * @chip: Optional: associated interrupt chip * @chip_data: Optional: associated interrupt chip data @@ -374,14 +376,15 @@ struct msi_domain_ops { * @data: Optional: domain specific data */ struct msi_domain_info { - u32 flags; - struct msi_domain_ops *ops; - struct irq_chip *chip; - void *chip_data; - irq_flow_handler_t handler; - void *handler_data; - const char *handler_name; - void *data; + u32 flags; + enum irq_domain_bus_token bus_token; + struct msi_domain_ops *ops; + struct irq_chip *chip; + void *chip_data; + irq_flow_handler_t handler; + void *handler_data; + const char *handler_name; + void *data; }; /* Flags for msi_domain_info */ -- cgit From a474d3fbe287625c6c1cfc56c2a456c5fb7c479e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Nov 2022 14:54:38 +0100 Subject: PCI/MSI: Get rid of PCI_MSI_IRQ_DOMAIN What a zoo: PCI_MSI select GENERIC_MSI_IRQ PCI_MSI_IRQ_DOMAIN def_bool y depends on PCI_MSI select GENERIC_MSI_IRQ_DOMAIN Ergo PCI_MSI enables PCI_MSI_IRQ_DOMAIN which in turn selects GENERIC_MSI_IRQ_DOMAIN. So all the dependencies on PCI_MSI_IRQ_DOMAIN are just an indirection to PCI_MSI. Match the reality and just admit that PCI_MSI requires GENERIC_MSI_IRQ_DOMAIN. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20221111122014.467556921@linutronix.de --- include/linux/msi.h | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 2dfd7b2bdfbe..9b8414523916 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -238,15 +238,6 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, } #endif -#ifdef CONFIG_PCI_MSI -struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc); -void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg); -#else /* CONFIG_PCI_MSI */ -static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) -{ -} -#endif /* CONFIG_PCI_MSI */ - int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc); void msi_free_msi_descs_range(struct device *dev, unsigned int first_index, unsigned int last_index); @@ -259,12 +250,6 @@ static inline void msi_free_msi_descs(struct device *dev) msi_free_msi_descs_range(dev, 0, MSI_MAX_INDEX); } -void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); -void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); - -void pci_msi_mask_irq(struct irq_data *data); -void pci_msi_unmask_irq(struct irq_data *data); - /* * The arch hooks to setup up msi irqs. Default functions are implemented * as weak symbols so that they /can/ be overriden by architecture specific @@ -468,18 +453,26 @@ void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int vir void *platform_msi_get_host_data(struct irq_domain *domain); #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ -#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN +/* PCI specific interfaces */ +#ifdef CONFIG_PCI_MSI +struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc); +void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg); +void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); +void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); +void pci_msi_mask_irq(struct irq_data *data); +void pci_msi_unmask_irq(struct irq_data *data); struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); bool pci_dev_has_special_msi_domain(struct pci_dev *pdev); -#else +#else /* CONFIG_PCI_MSI */ static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) { return NULL; } -#endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */ +static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) { } +#endif /* !CONFIG_PCI_MSI */ #endif /* LINUX_MSI_H */ -- cgit From 13e7accb81d6c07993385af8342238ff22b41ac8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Nov 2022 14:54:40 +0100 Subject: genirq: Get rid of GENERIC_MSI_IRQ_DOMAIN Adjust to reality and remove another layer of pointless Kconfig indirection. CONFIG_GENERIC_MSI_IRQ is good enough to serve all purposes. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221111122014.524842979@linutronix.de --- include/linux/device.h | 8 +++----- include/linux/gpio/driver.h | 2 +- include/linux/msi.h | 8 +++----- 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 424b55df0272..c90a444be1c4 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -378,10 +378,8 @@ struct dev_links_info { * @data: Pointer to MSI device data */ struct dev_msi_info { -#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN - struct irq_domain *domain; -#endif #ifdef CONFIG_GENERIC_MSI_IRQ + struct irq_domain *domain; struct msi_device_data *data; #endif }; @@ -742,7 +740,7 @@ static inline void set_dev_node(struct device *dev, int node) static inline struct irq_domain *dev_get_msi_domain(const struct device *dev) { -#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +#ifdef CONFIG_GENERIC_MSI_IRQ return dev->msi.domain; #else return NULL; @@ -751,7 +749,7 @@ static inline struct irq_domain *dev_get_msi_domain(const struct device *dev) static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d) { -#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +#ifdef CONFIG_GENERIC_MSI_IRQ dev->msi.domain = d; #endif } diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 6aeea1071b1b..88ae4513abb5 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -27,7 +27,7 @@ struct gpio_chip; union gpio_irq_fwspec { struct irq_fwspec fwspec; -#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +#ifdef CONFIG_GENERIC_MSI_IRQ msi_alloc_info_t msiinfo; #endif }; diff --git a/include/linux/msi.h b/include/linux/msi.h index 9b8414523916..8b287148e512 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -79,9 +79,7 @@ void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); #else -static inline void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) -{ -} +static inline void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) { } #endif typedef void (*irq_write_msi_msg_t)(struct msi_desc *desc, @@ -278,7 +276,7 @@ static inline void msi_device_destroy_sysfs(struct device *dev) { } */ bool arch_restore_msi_irqs(struct pci_dev *dev); -#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +#ifdef CONFIG_GENERIC_MSI_IRQ #include @@ -451,7 +449,7 @@ int platform_msi_device_domain_alloc(struct irq_domain *domain, unsigned int vir void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int virq, unsigned int nvec); void *platform_msi_get_host_data(struct irq_domain *domain); -#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ +#endif /* CONFIG_GENERIC_MSI_IRQ */ /* PCI specific interfaces */ #ifdef CONFIG_PCI_MSI -- cgit From 5c0997dc33ac24b7dc0124c2fc1caa37ae39461a Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Fri, 11 Nov 2022 14:54:50 +0100 Subject: PCI/MSI: Move pci_alloc_irq_vectors() to api.c To disentangle the maze in msi.c, all exported device-driver MSI APIs are now to be grouped in one file, api.c. Make pci_alloc_irq_vectors() a real function instead of wrapper and add proper kernel doc to it. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20221111122014.870888193@linutronix.de --- include/linux/pci.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 2bda4a4e47e8..243e48f79e82 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1553,6 +1553,8 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev, return rc; return 0; } +int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags, struct irq_affinity *affd); @@ -1586,6 +1588,13 @@ pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, return 1; return -ENOSPC; } +static inline int +pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags) +{ + return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, + flags, NULL); +} static inline void pci_free_irq_vectors(struct pci_dev *dev) { @@ -1898,15 +1907,13 @@ pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, { return -ENOSPC; } -#endif /* CONFIG_PCI */ - static inline int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags) { - return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags, - NULL); + return -ENOSPC; } +#endif /* CONFIG_PCI */ /* Include architecture-dependent settings and functions */ -- cgit From 2569f62ca473584a8ba389f455fc00805389f7da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Nov 2022 14:55:15 +0100 Subject: genirq/msi: Remove msi_domain_ops:: Msi_check() No more users. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221111122015.807616900@linutronix.de --- include/linux/msi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 8b287148e512..1ce9d5e0bfa3 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -292,7 +292,6 @@ struct msi_domain_info; * @get_hwirq: Retrieve the resulting hw irq number * @msi_init: Domain specific init function for MSI interrupts * @msi_free: Domain specific function to free a MSI interrupts - * @msi_check: Callback for verification of the domain/info/dev data * @msi_prepare: Prepare the allocation of the interrupts in the domain * @set_desc: Set the msi descriptor for an interrupt * @domain_alloc_irqs: Optional function to override the default allocation @@ -330,9 +329,6 @@ struct msi_domain_ops { void (*msi_free)(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq); - int (*msi_check)(struct irq_domain *domain, - struct msi_domain_info *info, - struct device *dev); int (*msi_prepare)(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); -- cgit From 9eb8ca049c23afb0410f4a7b9a7158f1a0a3ad0e Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 16 Nov 2022 16:16:57 -0800 Subject: KVM: Obey kvm.halt_poll_ns in VMs not using KVM_CAP_HALT_POLL Obey kvm.halt_poll_ns in VMs not using KVM_CAP_HALT_POLL on every halt, rather than just sampling the module parameter when the VM is first created. This restore the original behavior of kvm.halt_poll_ns for VMs that have not opted into KVM_CAP_HALT_POLL. Notably, this change restores the ability for admins to disable or change the maximum halt-polling time system wide for VMs not using KVM_CAP_HALT_POLL. Reported-by: Christian Borntraeger Fixes: acd05785e48c ("kvm: add capability for halt polling") Signed-off-by: David Matlack Message-Id: <20221117001657.1067231-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 18592bdf4c1b..637a60607c7d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -776,6 +776,7 @@ struct kvm { struct srcu_struct srcu; struct srcu_struct irq_srcu; pid_t userspace_pid; + bool override_halt_poll_ns; unsigned int max_halt_poll_ns; u32 dirty_ring_size; bool vm_bugged; -- cgit From 3af43ba4c6019b29c048921eb8147eb010165329 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 16 Nov 2022 15:50:58 +0800 Subject: bpf: Pass map file to .map_update_batch directly Currently bpf_map_do_batch() first invokes fdget(batch.map_fd) to get the target map file, then it invokes generic_map_update_batch() to do batch update. generic_map_update_batch() will get the target map file by using fdget(batch.map_fd) again and pass it to bpf_map_update_value(). The problem is map file returned by the second fdget() may be NULL or a totally different file compared by map file in bpf_map_do_batch(). The reason is that the first fdget() only guarantees the liveness of struct file instead of file descriptor and the file description may be released by concurrent close() through pick_file(). It doesn't incur any problem as for now, because maps with batch update support don't use map file in .map_fd_get_ptr() ops. But it is better to fix the potential access of an invalid map file. Using __bpf_map_get() again in generic_map_update_batch() can not fix the problem, because batch.map_fd may be closed and reopened, and the returned map file may be different with map file got in bpf_map_do_batch(), so just passing the map file directly to .map_update_batch() in bpf_map_do_batch(). Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221116075059.1551277-1-houtao@huaweicloud.com --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54462dd28824..e60a5c052473 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -85,7 +85,8 @@ struct bpf_map_ops { int (*map_lookup_and_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); - int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr, + int (*map_update_batch)(struct bpf_map *map, struct file *map_file, + const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -1789,7 +1790,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); -int generic_map_update_batch(struct bpf_map *map, +int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_delete_batch(struct bpf_map *map, -- cgit From e9a688bcb19348862afe30d7c85bc37c4c293471 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 8 Oct 2022 20:42:54 -0600 Subject: random: use rejection sampling for uniform bounded random integers Until the very recent commits, many bounded random integers were calculated using `get_random_u32() % max_plus_one`, which not only incurs the price of a division -- indicating performance mostly was not a real issue -- but also does not result in a uniformly distributed output if max_plus_one is not a power of two. Recent commits moved to using `prandom_u32_max(max_plus_one)`, which replaces the division with a faster multiplication, but still does not solve the issue with non-uniform output. For some users, maybe this isn't a problem, and for others, maybe it is, but for the majority of users, probably the question has never been posed and analyzed, and nobody thought much about it, probably assuming random is random is random. In other words, the unthinking expectation of most users is likely that the resultant numbers are uniform. So we implement here an efficient way of generating uniform bounded random integers. Through use of compile-time evaluation, and avoiding divisions as much as possible, this commit introduces no measurable overhead. At least for hot-path uses tested, any potential difference was lost in the noise. On both clang and gcc, code generation is pretty small. The new function, get_random_u32_below(), lives in random.h, rather than prandom.h, and has a "get_random_xxx" function name, because it is suitable for all uses, including cryptography. In order to be efficient, we implement a kernel-specific variant of Daniel Lemire's algorithm from "Fast Random Integer Generation in an Interval", linked below. The kernel's variant takes advantage of constant folding to avoid divisions entirely in the vast majority of cases, works on both 32-bit and 64-bit architectures, and requests a minimal amount of bytes from the RNG. Link: https://arxiv.org/pdf/1805.10941.pdf Cc: stable@vger.kernel.org # to ease future backports that use this api Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 18 ++---------------- include/linux/random.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index e0a0759dd09c..1f4a0de7b019 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -23,24 +23,10 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); #define prandom_init_once(pcpu_state) \ DO_ONCE(prandom_seed_full_state, (pcpu_state)) -/** - * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro) - * @ep_ro: right open interval endpoint - * - * Returns a pseudo-random number that is in interval [0, ep_ro). This is - * useful when requesting a random index of an array containing ep_ro elements, - * for example. The result is somewhat biased when ep_ro is not a power of 2, - * so do not use this for cryptographic purposes. - * - * Returns: pseudo-random number in interval [0, ep_ro) - */ +/* Deprecated: use get_random_u32_below() instead. */ static inline u32 prandom_u32_max(u32 ep_ro) { - if (__builtin_constant_p(ep_ro <= 1U << 8) && ep_ro <= 1U << 8) - return (get_random_u8() * ep_ro) >> 8; - if (__builtin_constant_p(ep_ro <= 1U << 16) && ep_ro <= 1U << 16) - return (get_random_u16() * ep_ro) >> 16; - return ((u64)get_random_u32() * ep_ro) >> 32; + return get_random_u32_below(ep_ro); } /* diff --git a/include/linux/random.h b/include/linux/random.h index 147a5e0d0b8e..3a82c0a8bc46 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -51,6 +51,46 @@ static inline unsigned long get_random_long(void) #endif } +u32 __get_random_u32_below(u32 ceil); + +/* + * Returns a random integer in the interval [0, ceil), with uniform + * distribution, suitable for all uses. Fastest when ceil is a constant, but + * still fast for variable ceil as well. + */ +static inline u32 get_random_u32_below(u32 ceil) +{ + if (!__builtin_constant_p(ceil)) + return __get_random_u32_below(ceil); + + /* + * For the fast path, below, all operations on ceil are precomputed by + * the compiler, so this incurs no overhead for checking pow2, doing + * divisions, or branching based on integer size. The resultant + * algorithm does traditional reciprocal multiplication (typically + * optimized by the compiler into shifts and adds), rejecting samples + * whose lower half would indicate a range indivisible by ceil. + */ + BUILD_BUG_ON_MSG(!ceil, "get_random_u32_below() must take ceil > 0"); + if (ceil <= 1) + return 0; + for (;;) { + if (ceil <= 1U << 8) { + u32 mult = ceil * get_random_u8(); + if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil)) + return mult >> 8; + } else if (ceil <= 1U << 16) { + u32 mult = ceil * get_random_u16(); + if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil)) + return mult >> 16; + } else { + u64 mult = (u64)ceil * get_random_u32(); + if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil)) + return mult >> 32; + } + } +} + /* * On 64-bit architectures, protect against non-terminated C string overflows * by zeroing out the first byte of the canary; this leaves 56 bits of entropy. -- cgit From 7f576b2593a978451416424e75f69ad1e3ae4efe Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 19 Oct 2022 23:19:35 -0600 Subject: random: add helpers for random numbers with given floor or range Now that we have get_random_u32_below(), it's nearly trivial to make inline helpers to compute get_random_u32_above() and get_random_u32_inclusive(), which will help clean up open coded loops and manual computations throughout the tree. One snag is that in order to make get_random_u32_inclusive() operate on closed intervals, we have to do some (unlikely) special case handling if get_random_u32_inclusive(0, U32_MAX) is called. The least expensive way of doing this is actually to adjust the slowpath of get_random_u32_below() to have its undefined 0 result just return the output of get_random_u32(). We can make this basically free by calling get_random_u32() before the branch, so that the branch latency gets interleaved. Cc: stable@vger.kernel.org # to ease future backports that use this api Reviewed-by: Kees Cook Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 3a82c0a8bc46..bd954ecbef90 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -91,6 +91,31 @@ static inline u32 get_random_u32_below(u32 ceil) } } +/* + * Returns a random integer in the interval (floor, U32_MAX], with uniform + * distribution, suitable for all uses. Fastest when floor is a constant, but + * still fast for variable floor as well. + */ +static inline u32 get_random_u32_above(u32 floor) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && floor == U32_MAX, + "get_random_u32_above() must take floor < U32_MAX"); + return floor + 1 + get_random_u32_below(U32_MAX - floor); +} + +/* + * Returns a random integer in the interval [floor, ceil], with uniform + * distribution, suitable for all uses. Fastest when floor and ceil are + * constant, but still fast for variable floor and ceil as well. + */ +static inline u32 get_random_u32_inclusive(u32 floor, u32 ceil) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && __builtin_constant_p(ceil) && + (floor > ceil || ceil - floor == U32_MAX), + "get_random_u32_inclusive() must take floor <= ceil"); + return floor + get_random_u32_below(ceil - floor + 1); +} + /* * On 64-bit architectures, protect against non-terminated C string overflows * by zeroing out the first byte of the canary; this leaves 56 bits of entropy. -- cgit From 8032bf1233a74627ce69b803608e650f3f35971c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 9 Oct 2022 20:44:02 -0600 Subject: treewide: use get_random_u32_below() instead of deprecated function This is a simple mechanical transformation done by: @@ expression E; @@ - prandom_u32_max + get_random_u32_below (E) Reviewed-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Acked-by: Darrick J. Wong # for xfs Reviewed-by: SeongJae Park # for damon Reviewed-by: Jason Gunthorpe # for infiniband Reviewed-by: Russell King (Oracle) # for arm Acked-by: Ulf Hansson # for mmc Signed-off-by: Jason A. Donenfeld --- include/linux/damon.h | 2 +- include/linux/nodemask.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 620ada094c3b..84525b9cdf6e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -21,7 +21,7 @@ /* Get a random number in [l, r) */ static inline unsigned long damon_rand(unsigned long l, unsigned long r) { - return l + prandom_u32_max(r - l); + return l + get_random_u32_below(r - l); } /** diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index efef68c9352a..bb0ee80526b2 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -516,7 +516,7 @@ static inline int node_random(const nodemask_t *maskp) bit = first_node(*maskp); break; default: - bit = find_nth_bit(maskp->bits, MAX_NUMNODES, prandom_u32_max(w)); + bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_u32_below(w)); break; } return bit; -- cgit From b3883a9a1f09e7b41f4dcb1bbd7262216a62d253 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 23 Oct 2022 22:06:00 +0200 Subject: stackprotector: move get_random_canary() into stackprotector.h This has nothing to do with random.c and everything to do with stack protectors. Yes, it uses randomness. But many things use randomness. random.h and random.c are concerned with the generation of randomness, not with each and every use. So move this function into the more specific stackprotector.h file where it belongs. Acked-by: Greg Kroah-Hartman Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 19 ------------------- include/linux/stackprotector.h | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index bd954ecbef90..d1d680b9701d 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -116,25 +116,6 @@ static inline u32 get_random_u32_inclusive(u32 floor, u32 ceil) return floor + get_random_u32_below(ceil - floor + 1); } -/* - * On 64-bit architectures, protect against non-terminated C string overflows - * by zeroing out the first byte of the canary; this leaves 56 bits of entropy. - */ -#ifdef CONFIG_64BIT -# ifdef __LITTLE_ENDIAN -# define CANARY_MASK 0xffffffffffffff00UL -# else /* big endian, 64 bits: */ -# define CANARY_MASK 0x00ffffffffffffffUL -# endif -#else /* 32 bits: */ -# define CANARY_MASK 0xffffffffUL -#endif - -static inline unsigned long get_random_canary(void) -{ - return get_random_long() & CANARY_MASK; -} - void __init random_init_early(const char *command_line); void __init random_init(void); bool rng_is_initialized(void); diff --git a/include/linux/stackprotector.h b/include/linux/stackprotector.h index 4c678c4fec58..9c88707d9a0f 100644 --- a/include/linux/stackprotector.h +++ b/include/linux/stackprotector.h @@ -6,6 +6,25 @@ #include #include +/* + * On 64-bit architectures, protect against non-terminated C string overflows + * by zeroing out the first byte of the canary; this leaves 56 bits of entropy. + */ +#ifdef CONFIG_64BIT +# ifdef __LITTLE_ENDIAN +# define CANARY_MASK 0xffffffffffffff00UL +# else /* big endian, 64 bits: */ +# define CANARY_MASK 0x00ffffffffffffffUL +# endif +#else /* 32 bits: */ +# define CANARY_MASK 0xffffffffUL +#endif + +static inline unsigned long get_random_canary(void) +{ + return get_random_long() & CANARY_MASK; +} + #if defined(CONFIG_STACKPROTECTOR) || defined(CONFIG_ARM64_PTR_AUTH) # include #else -- cgit From 2c03e16f44993d45f064580adb0eb408b513ad72 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 29 Oct 2022 01:18:04 +0200 Subject: random: remove early archrandom abstraction The arch_get_random*_early() abstraction is not completely useful and adds complexity, because it's not a given that there will be no calls to arch_get_random*() between random_init_early(), which uses arch_get_random*_early(), and init_cpu_features(). During that gap, crng_reseed() might be called, which uses arch_get_random*(), since it's mostly not init code. Instead we can test whether we're in the early phase in arch_get_random*() itself, and in doing so avoid all ambiguity about where we are. Fortunately, the only architecture that currently implements arch_get_random*_early() also has an alternatives-based cpu feature system, one flag of which determines whether the other flags have been initialized. This makes it possible to do the early check with zero cost once the system is initialized. Reviewed-by: Catalin Marinas Cc: Will Deacon Cc: Ard Biesheuvel Cc: Jean-Philippe Brucker Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index d1d680b9701d..9455d93f5ba3 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -154,26 +154,6 @@ declare_get_random_var_wait(long, unsigned long) #include -/* - * Called from the boot CPU during startup; not valid to call once - * secondary CPUs are up and preemption is possible. - */ -#ifndef arch_get_random_seed_longs_early -static inline size_t __init arch_get_random_seed_longs_early(unsigned long *v, size_t max_longs) -{ - WARN_ON(system_state != SYSTEM_BOOTING); - return arch_get_random_seed_longs(v, max_longs); -} -#endif - -#ifndef arch_get_random_longs_early -static inline bool __init arch_get_random_longs_early(unsigned long *v, size_t max_longs) -{ - WARN_ON(system_state != SYSTEM_BOOTING); - return arch_get_random_longs(v, max_longs); -} -#endif - #ifdef CONFIG_SMP int random_prepare_cpu(unsigned int cpu); int random_online_cpu(unsigned int cpu); -- cgit From db516da95ce458f0cab5e7ae2db93fd821484d7d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 6 Nov 2022 16:02:43 +0100 Subject: hw_random: use add_hwgenerator_randomness() for early entropy Rather than calling add_device_randomness(), the add_early_randomness() function should use add_hwgenerator_randomness(), so that the early entropy can be potentially credited, which allows for the RNG to initialize earlier without having to wait for the kthread to come up. This requires some minor API refactoring, by adding a `sleep_after` parameter to add_hwgenerator_randomness(), so that we don't hit a blocking sleep from add_early_randomness(). Tested-by: AngeloGioacchino Del Regno Tested-by: Marek Szyprowski Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Dominik Brodowski Acked-by: Herbert Xu Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 9455d93f5ba3..3b028c7ca065 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -17,7 +17,7 @@ void __init add_bootloader_randomness(const void *buf, size_t len); void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; void add_interrupt_randomness(int irq) __latent_entropy; -void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy); +void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after); #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) static inline void add_latent_entropy(void) -- cgit From d7bf7f3b813e3755226bcb5114ad2ac477514ebf Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 1 Jun 2022 22:45:33 +0200 Subject: random: always mix cycle counter in add_latent_entropy() add_latent_entropy() is called every time a process forks, in kernel_clone(). This in turn calls add_device_randomness() using the latent entropy global state. add_device_randomness() does two things: 2) Mixes into the input pool the latent entropy argument passed; and 1) Mixes in a cycle counter, a sort of measurement of when the event took place, the high precision bits of which are presumably difficult to predict. (2) is impossible without CONFIG_GCC_PLUGIN_LATENT_ENTROPY=y. But (1) is always possible. However, currently CONFIG_GCC_PLUGIN_LATENT_ENTROPY=n disables both (1) and (2), instead of just (2). This commit causes the CONFIG_GCC_PLUGIN_LATENT_ENTROPY=n case to still do (1) by passing NULL (len 0) to add_device_randomness() when add_latent_ entropy() is called. Cc: Dominik Brodowski Cc: PaX Team Cc: Emese Revfy Fixes: 38addce8b600 ("gcc-plugins: Add latent_entropy plugin") Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 3b028c7ca065..579117d83eb8 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -19,14 +19,14 @@ void add_input_randomness(unsigned int type, unsigned int code, void add_interrupt_randomness(int irq) __latent_entropy; void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after); -#if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) static inline void add_latent_entropy(void) { +#if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); -} #else -static inline void add_latent_entropy(void) { } + add_device_randomness(NULL, 0); #endif +} #if IS_ENABLED(CONFIG_VMGENID) void add_vmfork_randomness(const void *unique_vm_id, size_t len); -- cgit From 282de143ead96a5d53331e946f31c977b4610a74 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:55 +0530 Subject: bpf: Introduce allocated objects support Introduce support for representing pointers to objects allocated by the BPF program, i.e. PTR_TO_BTF_ID that point to a type in program BTF. This is indicated by the presence of MEM_ALLOC type flag in reg->type to avoid having to check btf_is_kernel when trying to match argument types in helpers. Whenever walking such types, any pointers being walked will always yield a SCALAR instead of pointer. In the future we might permit kptr inside such allocated objects (either kernel or program allocated), and it will then form a PTR_TO_BTF_ID of the respective type. For now, such allocated objects will always be referenced in verifier context, hence ref_obj_id == 0 for them is a bug. It is allowed to write to such objects, as long fields that are special are not touched (support for which will be added in subsequent patches). Note that once such a pointer is marked PTR_UNTRUSTED, it is no longer allowed to write to it. No PROBE_MEM handling is therefore done for loads into this type unless PTR_UNTRUSTED is part of the register type, since they can never be in an undefined state, and their lifetime will always be valid. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e60a5c052473..7440c20c4192 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -525,6 +525,11 @@ enum bpf_type_flag { /* Size is known at compile time. */ MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS), + /* MEM is of an allocated object of type in program BTF. This is used to + * tag PTR_TO_BTF_ID allocated using bpf_obj_new. + */ + MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -2792,4 +2797,10 @@ struct bpf_key { bool has_ref; }; #endif /* CONFIG_KEYS */ + +static inline bool type_is_alloc(u32 type) +{ + return type & MEM_ALLOC; +} + #endif /* _LINUX_BPF_H */ -- cgit From 8ffa5cc142137a59d6a10eb5273fa2ba5dcd4947 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:56 +0530 Subject: bpf: Recognize lock and list fields in allocated objects Allow specifying bpf_spin_lock, bpf_list_head, bpf_list_node fields in a allocated object. Also update btf_struct_access to reject direct access to these special fields. A bpf_list_head allows implementing map-in-map style use cases, where an allocated object with bpf_list_head is linked into a list in a map value. This would require embedding a bpf_list_node, support for which is also included. The bpf_spin_lock is used to protect the bpf_list_head and other data. While we strictly don't require to hold a bpf_spin_lock while touching the bpf_list_head in such objects, as when have access to it, we have complete ownership of the object, the locking constraint is still kept and may be conditionally lifted in the future. Note that the specification of such types can be done just like map values, e.g.: struct bar { struct bpf_list_node node; }; struct foo { struct bpf_spin_lock lock; struct bpf_list_head head __contains(bar, node); struct bpf_list_node node; }; struct map_value { struct bpf_spin_lock lock; struct bpf_list_head head __contains(foo, node); }; To recognize such types in user BTF, we build a btf_struct_metas array of metadata items corresponding to each BTF ID. This is done once during the btf_parse stage to avoid having to do it each time during the verification process's requirement to inspect the metadata. Moreover, the computed metadata needs to be passed to some helpers in future patches which requires allocating them and storing them in the BTF that is pinned by the program itself, so that valid access can be assumed to such data during program runtime. A key thing to note is that once a btf_struct_meta is available for a type, both the btf_record and btf_field_offs should be available. It is critical that btf_field_offs is available in case special fields are present, as we extensively rely on special fields being zeroed out in map values and allocated objects in later patches. The code ensures that by bailing out in case of errors and ensuring both are available together. If the record is not available, the special fields won't be recognized, so not having both is also fine (in terms of being a verification error and not a runtime bug). Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ include/linux/btf.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7440c20c4192..eb6ea53fa5a2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -177,6 +177,7 @@ enum btf_field_type { BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, BPF_LIST_HEAD = (1 << 4), + BPF_LIST_NODE = (1 << 5), }; struct btf_field_kptr { @@ -277,6 +278,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "kptr"; case BPF_LIST_HEAD: return "bpf_list_head"; + case BPF_LIST_NODE: + return "bpf_list_node"; default: WARN_ON_ONCE(1); return "unknown"; @@ -295,6 +298,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); + case BPF_LIST_NODE: + return sizeof(struct bpf_list_node); default: WARN_ON_ONCE(1); return 0; @@ -313,6 +318,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); + case BPF_LIST_NODE: + return __alignof__(struct bpf_list_node); default: WARN_ON_ONCE(1); return 0; diff --git a/include/linux/btf.h b/include/linux/btf.h index d80345fa566b..a01a8da20021 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include @@ -78,6 +80,17 @@ struct btf_id_dtor_kfunc { u32 kfunc_btf_id; }; +struct btf_struct_meta { + u32 btf_id; + struct btf_record *record; + struct btf_field_offs *field_offs; +}; + +struct btf_struct_metas { + u32 cnt; + struct btf_struct_meta types[]; +}; + typedef void (*btf_dtor_kfunc_t)(void *); extern const struct file_operations btf_fops; @@ -408,6 +421,23 @@ static inline struct btf_param *btf_params(const struct btf_type *t) return (struct btf_param *)(t + 1); } +static inline int btf_id_cmp_func(const void *a, const void *b) +{ + const int *pa = a, *pb = b; + + return *pa - *pb; +} + +static inline bool btf_id_set_contains(const struct btf_id_set *set, u32 id) +{ + return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; +} + +static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) +{ + return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); +} + #ifdef CONFIG_BPF_SYSCALL struct bpf_prog; @@ -423,6 +453,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); +struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -454,6 +485,10 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt { return 0; } +static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) +{ + return NULL; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) -- cgit From 865ce09a49d79d2b2c1d980f4c05ffc0b3517bdc Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:57 +0530 Subject: bpf: Verify ownership relationships for user BTF types Ensure that there can be no ownership cycles among different types by way of having owning objects that can hold some other type as their element. For instance, a map value can only hold allocated objects, but these are allowed to have another bpf_list_head. To prevent unbounded recursion while freeing resources, elements of bpf_list_head in local kptrs can never have a bpf_list_head which are part of list in a map value. Later patches will verify this by having dedicated BTF selftests. Also, to make runtime destruction easier, once btf_struct_metas is fully populated, we can stash the metadata of the value type directly in the metadata of the list_head fields, as that allows easier access to the value type's layout to destruct it at runtime from the btf_field entry of the list head itself. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/btf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb6ea53fa5a2..323985a39ece 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -191,6 +191,7 @@ struct btf_field_list_head { struct btf *btf; u32 value_btf_id; u32 node_offset; + struct btf_record *value_rec; }; struct btf_field { diff --git a/include/linux/btf.h b/include/linux/btf.h index a01a8da20021..42d8f3730a8d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -178,6 +178,7 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); +int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec); struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); -- cgit From d0d78c1df9b1dbfb5e172de473561ce09d5e9d39 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:59 +0530 Subject: bpf: Allow locking bpf_spin_lock global variables Global variables reside in maps accessible using direct_value_addr callbacks, so giving each load instruction's rewrite a unique reg->id disallows us from holding locks which are global. The reason for preserving reg->id as a unique value for registers that may point to spin lock is that two separate lookups are treated as two separate memory regions, and any possible aliasing is ignored for the purposes of spin lock correctness. This is not great especially for the global variable case, which are served from maps that have max_entries == 1, i.e. they always lead to map values pointing into the same map value. So refactor the active_spin_lock into a 'active_lock' structure which represents the lock identity, and instead of the reg->id, remember two fields, a pointer and the reg->id. The pointer will store reg->map_ptr or reg->btf. It's only necessary to distinguish for the id == 0 case of global variables, but always setting the pointer to a non-NULL value and using the pointer to check whether the lock is held simplifies code in the verifier. This is generic enough to allow it for global variables, map lookups, and allocated objects at the same time. Note that while whether a lock is held can be answered by just comparing active_lock.ptr to NULL, to determine whether the register is pointing to the same held lock requires comparing _both_ ptr and id. Finally, as a result of this refactoring, pseudo load instructions are not given a unique reg->id, as they are doing lookup for the same map value (max_entries is never greater than 1). Essentially, we consider that the tuple of (ptr, id) will always be unique for any kind of argument to bpf_spin_{lock,unlock}. Note that this can be extended in the future to also remember offset used for locking, so that we can introduce multiple bpf_spin_lock fields in the same allocation. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-10-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1a32baa78ce2..1db2b4dc7009 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -323,7 +323,21 @@ struct bpf_verifier_state { u32 branches; u32 insn_idx; u32 curframe; - u32 active_spin_lock; + /* For every reg representing a map value or allocated object pointer, + * we consider the tuple of (ptr, id) for them to be unique in verifier + * context and conside them to not alias each other for the purposes of + * tracking lock state. + */ + struct { + /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, + * there's no active lock held, and other fields have no + * meaning. If non-NULL, it indicates that a lock is held and + * id member has the reg->id of the register which can be >= 0. + */ + void *ptr; + /* This will be reg->id */ + u32 id; + } active_lock; bool speculative; /* first and last insn idx of this verifier state */ -- cgit From 00b85860feb809852af9a88cb4ca8766d7dff6a3 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:01 +0530 Subject: bpf: Rewrite kfunc argument handling As we continue to add more features, argument types, kfunc flags, and different extensions to kfuncs, the code to verify the correctness of the kfunc prototype wrt the passed in registers has become ad-hoc and ugly to read. To make life easier, and make a very clear split between different stages of argument processing, move all the code into verifier.c and refactor into easier to read helpers and functions. This also makes sharing code within the verifier easier with kfunc argument processing. This will be more and more useful in later patches as we are now moving to implement very core BPF helpers as kfuncs, to keep them experimental before baking into UAPI. Remove all kfunc related bits now from btf_check_func_arg_match, as users have been converted away to refactored kfunc argument handling. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-12-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ----------- include/linux/bpf_verifier.h | 2 -- include/linux/btf.h | 31 ++++++++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 323985a39ece..0a74df731eb8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2109,22 +2109,11 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); -struct bpf_kfunc_arg_meta { - u64 r0_size; - bool r0_rdonly; - int ref_obj_id; - u32 flags; -}; - struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); -int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, - const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs, - struct bpf_kfunc_arg_meta *meta); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1db2b4dc7009..fb146b0ce006 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -603,8 +603,6 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type); -int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, diff --git a/include/linux/btf.h b/include/linux/btf.h index 42d8f3730a8d..d5b26380a60f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -338,6 +338,16 @@ static inline bool btf_type_is_struct(const struct btf_type *t) return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } +static inline bool __btf_type_is_struct(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +} + +static inline bool btf_type_is_array(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; +} + static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); @@ -439,9 +449,10 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); } -#ifdef CONFIG_BPF_SYSCALL struct bpf_prog; +struct bpf_verifier_log; +#ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); @@ -455,6 +466,12 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); +const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg); +bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -490,6 +507,18 @@ static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf { return NULL; } +static inline const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) +{ + return NULL; +} +static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) +{ + return false; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) -- cgit From 958cf2e273f0929c66169e0788031310e8118722 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:03 +0530 Subject: bpf: Introduce bpf_obj_new Introduce type safe memory allocator bpf_obj_new for BPF programs. The kernel side kfunc is named bpf_obj_new_impl, as passing hidden arguments to kfuncs still requires having them in prototype, unlike BPF helpers which always take 5 arguments and have them checked using bpf_func_proto in verifier, ignoring unset argument types. Introduce __ign suffix to ignore a specific kfunc argument during type checks, then use this to introduce support for passing type metadata to the bpf_obj_new_impl kfunc. The user passes BTF ID of the type it wants to allocates in program BTF, the verifier then rewrites the first argument as the size of this type, after performing some sanity checks (to ensure it exists and it is a struct type). The second argument is also fixed up and passed by the verifier. This is the btf_struct_meta for the type being allocated. It would be needed mostly for the offset array which is required for zero initializing special fields while leaving the rest of storage in unitialized state. It would also be needed in the next patch to perform proper destruction of the object's special fields. Under the hood, bpf_obj_new will call bpf_mem_alloc and bpf_mem_free, using the any context BPF memory allocator introduced recently. To this end, a global instance of the BPF memory allocator is initialized on boot to be used for this purpose. This 'bpf_global_ma' serves all allocations for bpf_obj_new. In the future, bpf_obj_new variants will allow specifying a custom allocator. Note that now that bpf_obj_new can be used to allocate objects that can be linked to BPF linked list (when future linked list helpers are available), we need to also free the elements using bpf_mem_free. However, since the draining of elements is done outside the bpf_spin_lock, we need to do migrate_disable around the call since bpf_list_head_free can be called from map free path where migration is enabled. Otherwise, when called from BPF programs migration is already disabled. A convenience macro is included in the bpf_experimental.h header to hide over the ugly details of the implementation, leading to user code looking similar to a language level extension which allocates and constructs fields of a user type. struct bar { struct bpf_list_node node; }; struct foo { struct bpf_spin_lock lock; struct bpf_list_head head __contains(bar, node); }; void prog(void) { struct foo *f; f = bpf_obj_new(typeof(*f)); if (!f) return; ... } A key piece of this story is still missing, i.e. the free function, which will come in the next patch. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-14-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 21 +++++++++++++-------- include/linux/bpf_verifier.h | 2 ++ 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0a74df731eb8..8b32376ce746 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -54,6 +54,8 @@ struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; +extern struct bpf_mem_alloc bpf_global_ma; +extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, @@ -334,16 +336,19 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) { - if (!IS_ERR_OR_NULL(map->record)) { - struct btf_field *fields = map->record->fields; - u32 cnt = map->record->cnt; - int i; + int i; - for (i = 0; i < cnt; i++) - memset(dst + fields[i].offset, 0, btf_field_type_size(fields[i].type)); - } + if (!foffs) + return; + for (i = 0; i < foffs->cnt; i++) + memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]); +} + +static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +{ + bpf_obj_init(map->field_offs, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index fb146b0ce006..3dc72d396dfc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -433,6 +433,8 @@ struct bpf_insn_aux_data { */ struct bpf_loop_inline_state loop_inline_state; }; + u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */ + struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ -- cgit From 534e86bc6c66e1e0c798a1c0a6a680bb231c08db Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:07 +0530 Subject: bpf: Add 'release on unlock' logic for bpf_list_push_{front,back} This commit implements the delayed release logic for bpf_list_push_front and bpf_list_push_back. Once a node has been added to the list, it's pointer changes to PTR_UNTRUSTED. However, it is only released once the lock protecting the list is unlocked. For such PTR_TO_BTF_ID | MEM_ALLOC with PTR_UNTRUSTED set but an active ref_obj_id, it is still permitted to read them as long as the lock is held. Writing to them is not allowed. This allows having read access to push items we no longer own until we release the lock guarding the list, allowing a little more flexibility when working with these APIs. Note that enabling write support has fairly tricky interactions with what happens inside the critical section. Just as an example, currently, bpf_obj_drop is not permitted, but if it were, being able to write to the PTR_UNTRUSTED pointer while the object gets released back to the memory allocator would violate safety properties we wish to guarantee (i.e. not crashing the kernel). The memory could be reused for a different type in the BPF program or even in the kernel as it gets eventually kfree'd. Not enabling bpf_obj_drop inside the critical section would appear to prevent all of the above, but that is more of an artifical limitation right now. Since the write support is tangled with how we handle potential aliasing of nodes inside the critical section that may or may not be part of the list anymore, it has been deferred to a future patch. Acked-by: Dave Marchevsky Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-18-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3dc72d396dfc..23f30c685f28 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -223,6 +223,11 @@ struct bpf_reference_state { * exiting a callback function. */ int callback_ref; + /* Mark the reference state to release the registers sharing the same id + * on bpf_spin_unlock (for nodes that we will lose ownership to but are + * safe to access inside the critical section). + */ + bool release_on_unlock; }; /* state of the program: -- cgit From f20a0a0519f35a3a34236ed2d38b067c5cb22b9f Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 17 Nov 2022 02:18:28 +0900 Subject: ethtool: doc: clarify what drivers can implement in their get_drvinfo() Many of the drivers which implement ethtool_ops::get_drvinfo() will prints the .driver, .version or .bus_info of struct ethtool_drvinfo. To have a glance of current state, do: $ git grep -W "get_drvinfo(struct" Printing in those three fields is useless because: - since [1], the driver version should be the kernel version (at least for upstream drivers). Arguably, out of tree drivers might still want to set a custom version, but out of tree is not our focus. - since [2], the core is able to provide default values for .driver and .bus_info. In summary, drivers may provide .fw_version and .erom_version, the rest is expected to be done by the core. In struct ethtool_ops doc from linux/ethtool: rephrase field get_drvinfo() doc to discourage developers from implementing this callback. In struct ethtool_drvinfo doc from uapi/linux/ethtool.h: remove the paragraph mentioning what drivers should do. Rationale: no need to repeat what is already written in struct ethtool_ops doc. But add a note that .fw_version and .erom_version are driver defined. Also update the dummy driver and simply remove the callback in order not to confuse the newcomers: most of the drivers will not need this callback function any more. [1] commit 6a7e25c7fb48 ("net/core: Replace driver version to be kernel version") Link: https://git.kernel.org/torvalds/linux/c/6a7e25c7fb48 [2] commit edaf5df22cb8 ("ethtool: ethtool_get_drvinfo: populate drvinfo fields even if callback exits") Link: https://git.kernel.org/netdev/net-next/c/edaf5df22cb8 Reviewed-by: Leon Romanovsky Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20221116171828.4093-1-mailhol.vincent@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 5c51c7fda32a..9e0a76fc7de9 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -473,10 +473,10 @@ struct ethtool_module_power_mode_params { * parameter. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. - * @get_drvinfo: Report driver/device information. Should only set the - * @driver, @version, @fw_version and @bus_info fields. If not - * implemented, the @driver and @bus_info fields will be filled in - * according to the netdev's parent device. + * @get_drvinfo: Report driver/device information. Modern drivers no + * longer have to implement this callback. Most fields are + * correctly filled in by the core using system information, or + * populated using other driver operations. * @get_regs_len: Get buffer length required for @get_regs * @get_regs: Get device registers * @get_wol: Report whether Wake-on-Lan is enabled -- cgit From 647541ea06a7bbbcf941e501c726b3e26328c102 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Nov 2022 10:40:21 -0500 Subject: sctp: move SCTP_PAD4 and SCTP_TRUNC4 to linux/sctp.h Move these two macros from net/sctp/sctp.h to linux/sctp.h, so that it will be enough to include only linux/sctp.h in nft_exthdr.c and xt_sctp.c. It should not include "net/sctp/sctp.h" if a module does not have a dependence on SCTP module. Signed-off-by: Xin Long Reviewed-by: Saeed Mahameed Link: https://lore.kernel.org/r/ef6468a687f36da06f575c2131cd4612f6b7be88.1668526821.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/sctp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index a86e852507b3..358dc08e0831 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -820,4 +820,9 @@ struct sctp_new_encap_port_hdr { __be16 new_port; }; +/* Round an int up to the next multiple of 4. */ +#define SCTP_PAD4(s) (((s)+3)&~3) +/* Truncate to the previous multiple of 4. */ +#define SCTP_TRUNC4(s) ((s)&~3) + #endif /* __LINUX_SCTP_H__ */ -- cgit From b76f64caa42e9bb78e177ae94e1062d876a401b1 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 17 Nov 2022 22:16:16 -0800 Subject: Input: max8997 - convert to modern way to get a reference to a PWM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pwm_request() isn't recommended to be used any more because it relies on global IDs for the PWM which comes with different difficulties. The new way to do things is to find the right PWM using a reference from the platform device. (This can be created either using a device-tree or a platform lookup table, see e.g. commit 5a4412d4a82f ("ARM: pxa: tavorevb: Use PWM lookup table") how to do this.) There are no in-tree users, so there are no other code locations that need adaption. Signed-off-by: Uwe Kleine-König Acked-by: Lee Jones Link: https://lore.kernel.org/r/20221117073543.3790449-1-u.kleine-koenig@pengutronix.de Signed-off-by: Dmitry Torokhov --- include/linux/mfd/max8997.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max8997.h b/include/linux/mfd/max8997.h index 6c98edcf4b0b..6193905abbb5 100644 --- a/include/linux/mfd/max8997.h +++ b/include/linux/mfd/max8997.h @@ -110,8 +110,6 @@ enum max8997_haptic_pwm_divisor { /** * max8997_haptic_platform_data - * @pwm_channel_id: channel number of PWM device - * valid for MAX8997_EXTERNAL_MODE * @pwm_period: period in nano second for PWM device * valid for MAX8997_EXTERNAL_MODE * @type: motor type @@ -128,7 +126,6 @@ enum max8997_haptic_pwm_divisor { * [0 - 255]: available period */ struct max8997_haptic_platform_data { - unsigned int pwm_channel_id; unsigned int pwm_period; enum max8997_haptic_motor_type type; -- cgit From 70912985545adc81716164271401c9ffb0acdd0f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 26 Sep 2022 21:32:16 +0200 Subject: efi: libstub: Implement devicepath support for initrd commandline loader Currently, the initrd= command line option to the EFI stub only supports loading files that reside on the same volume as the loaded image, which is not workable for loaders like GRUB that don't even implement the volume abstraction (EFI_SIMPLE_FILE_SYSTEM_PROTOCOL), and load the kernel from an anonymous buffer in memory. For this reason, another method was devised that relies on the LoadFile2 protocol. However, the command line loader is rather useful when using the UEFI shell or other generic loaders that have no awareness of Linux specific protocols so let's make it a bit more flexible, by permitting textual device paths to be provided to initrd= as well, provided that they refer to a file hosted on a EFI_SIMPLE_FILE_SYSTEM_PROTOCOL volume. E.g., initrd=PciRoot(0x0)/Pci(0x3,0x0)/HD(1,MBR,0xBE1AFDFA,0x3F,0xFBFC1)/rootfs.cpio.gz Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 2d4d5a7fd781..ee034f0c5cc2 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -371,6 +371,7 @@ void efi_native_runtime_setup(void); #define LOADED_IMAGE_DEVICE_PATH_PROTOCOL_GUID EFI_GUID(0xbc62157e, 0x3e33, 0x4fec, 0x99, 0x20, 0x2d, 0x3b, 0x36, 0xd7, 0x50, 0xdf) #define EFI_DEVICE_PATH_PROTOCOL_GUID EFI_GUID(0x09576e91, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) #define EFI_DEVICE_PATH_TO_TEXT_PROTOCOL_GUID EFI_GUID(0x8b843e20, 0x8132, 0x4852, 0x90, 0xcc, 0x55, 0x1a, 0x4e, 0x4a, 0x7f, 0x1c) +#define EFI_DEVICE_PATH_FROM_TEXT_PROTOCOL_GUID EFI_GUID(0x05c99a21, 0xc70f, 0x4ad2, 0x8a, 0x5f, 0x35, 0xdf, 0x33, 0x43, 0xf5, 0x1e) #define EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID EFI_GUID(0x9042a9de, 0x23dc, 0x4a38, 0x96, 0xfb, 0x7a, 0xde, 0xd0, 0x80, 0x51, 0x6a) #define EFI_UGA_PROTOCOL_GUID EFI_GUID(0x982c298b, 0xf4fa, 0x41cb, 0xb8, 0x38, 0x77, 0xaa, 0x68, 0x8f, 0xb8, 0x39) #define EFI_PCI_IO_PROTOCOL_GUID EFI_GUID(0x4cf5b200, 0x68b8, 0x4ca5, 0x9e, 0xec, 0xb2, 0x3e, 0x3f, 0x50, 0x02, 0x9a) @@ -1011,6 +1012,11 @@ struct efi_mem_mapped_dev_path { u64 ending_addr; } __packed; +struct efi_file_path_dev_path { + struct efi_generic_dev_path header; + efi_char16_t filename[]; +} __packed; + struct efi_dev_path { union { struct efi_generic_dev_path header; -- cgit From 4059ba656ce5c13c8ca345955712152ae41420c8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Oct 2022 18:38:15 +0200 Subject: efi: memmap: Move EFI fake memmap support into x86 arch tree The EFI fake memmap support is specific to x86, which manipulates the EFI memory map in various different ways after receiving it from the EFI stub. On other architectures, we have managed to push back on this, and the EFI memory map is kept pristine. So let's move the fake memmap code into the x86 arch tree, where it arguably belongs. Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index ee034f0c5cc2..43146530374e 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -749,12 +749,6 @@ extern struct kobject *efi_kobj; extern int efi_reboot_quirk_mode; extern bool efi_poweroff_required(void); -#ifdef CONFIG_EFI_FAKE_MEMMAP -extern void __init efi_fake_memmap(void); -#else -static inline void efi_fake_memmap(void) { } -#endif - extern unsigned long efi_mem_attr_table; /* -- cgit From fdc6d38d64a20c542b1867ebeb8dd03b98829336 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Oct 2022 19:09:24 +0200 Subject: efi: memmap: Move manipulation routines into x86 arch tree The EFI memory map is a description of the memory layout as provided by the firmware, and only x86 manipulates it in various different ways for its own memory bookkeeping. So let's move the memmap routines that are only used by x86 into the x86 arch tree. Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 43146530374e..6a0bdef8375d 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -707,18 +707,10 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, #endif extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); -extern int __init efi_memmap_alloc(unsigned int num_entries, - struct efi_memory_map_data *data); -extern void __efi_memmap_free(u64 phys, unsigned long size, - unsigned long flags); +extern int __init __efi_memmap_init(struct efi_memory_map_data *data); extern int __init efi_memmap_init_early(struct efi_memory_map_data *data); extern int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size); extern void __init efi_memmap_unmap(void); -extern int __init efi_memmap_install(struct efi_memory_map_data *data); -extern int __init efi_memmap_split_count(efi_memory_desc_t *md, - struct range *range); -extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, - void *buf, struct efi_mem_range *mem); #ifdef CONFIG_EFI_ESRT extern void __init efi_esrt_init(void); -- cgit From 1fff234de2b6047ee311c6bf88f31ad5008f2889 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Nov 2022 09:17:16 +0100 Subject: efi: x86: Move EFI runtime map sysfs code to arch/x86 The EFI runtime map code is only wired up on x86, which is the only architecture that has a need for it in its implementation of kexec. So let's move this code under arch/x86 and drop all references to it from generic code. To ensure that the efi_runtime_map_init() is invoked at the appropriate time use a 'sync' subsys_initcall() that will be called right after the EFI initcall made from generic code where the original invocation of efi_runtime_map_init() resided. Signed-off-by: Ard Biesheuvel Reviewed-by: Dave Young --- include/linux/efi.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 6a0bdef8375d..400edf77bb12 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1089,34 +1089,6 @@ extern int efi_capsule_update(efi_capsule_header_t *capsule, static inline bool efi_capsule_pending(int *reset_type) { return false; } #endif -#ifdef CONFIG_EFI_RUNTIME_MAP -int efi_runtime_map_init(struct kobject *); -int efi_get_runtime_map_size(void); -int efi_get_runtime_map_desc_size(void); -int efi_runtime_map_copy(void *buf, size_t bufsz); -#else -static inline int efi_runtime_map_init(struct kobject *kobj) -{ - return 0; -} - -static inline int efi_get_runtime_map_size(void) -{ - return 0; -} - -static inline int efi_get_runtime_map_desc_size(void) -{ - return 0; -} - -static inline int efi_runtime_map_copy(void *buf, size_t bufsz) -{ - return 0; -} - -#endif - #ifdef CONFIG_EFI extern bool efi_runtime_disabled(void); #else -- cgit From 2fb6999dd06f3718dcca2cf9b11ea6c9ea9da833 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Fri, 28 Oct 2022 20:09:50 +0000 Subject: efi/cper, cxl: Decode CXL Error Log Print the CXL Error Log field as found in CXL Protocol Error Section. The CXL RAS Capability structure will be reused by OS First Handling and the duplication/appropriate placement will be addressed eventually. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Signed-off-by: Ard Biesheuvel --- include/linux/cxl_err.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 include/linux/cxl_err.h (limited to 'include/linux') diff --git a/include/linux/cxl_err.h b/include/linux/cxl_err.h new file mode 100644 index 000000000000..629e1bdeda44 --- /dev/null +++ b/include/linux/cxl_err.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Advanced Micro Devices, Inc. + * + * Author: Smita Koralahalli + */ + +#ifndef LINUX_CXL_ERR_H +#define LINUX_CXL_ERR_H + +/* CXL RAS Capability Structure, CXL v3.1 sec 8.2.4.16 */ +struct cxl_ras_capability_regs { + u32 uncor_status; + u32 uncor_mask; + u32 uncor_severity; + u32 cor_status; + u32 cor_mask; + u32 cap_control; + u32 header_log[16]; +}; + +#endif //__CXL_ERR_ -- cgit From 16bdbae394280f1d97933d919023eccbf0b564bd Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 7 Nov 2022 13:24:55 +0100 Subject: hwrng: core - treat default_quality as a maximum and default to 1024 Most hw_random devices return entropy which is assumed to be of full quality, but driver authors don't bother setting the quality knob. Some hw_random devices return less than full quality entropy, and then driver authors set the quality knob. Therefore, the entropy crediting should be opt-out rather than opt-in per-driver, to reflect the actual reality on the ground. For example, the two Raspberry Pi RNG drivers produce full entropy randomness, and both EDK2 and U-Boot's drivers for these treat them as such. The result is that EFI then uses these numbers and passes the to Linux, and Linux credits them as boot, thereby initializing the RNG. Yet, in Linux, the quality knob was never set to anything, and so on the chance that Linux is booted without EFI, nothing is ever credited. That's annoying. The same pattern appears to repeat itself throughout various drivers. In fact, very very few drivers have bothered setting quality=1024. Looking at the git history of existing drivers and corresponding mailing list discussion, this conclusion tracks. There's been a decent amount of discussion about drivers that set quality < 1024 -- somebody read and interepreted a datasheet, or made some back of the envelope calculation somehow. But there's been very little, if any, discussion about most drivers where the quality is just set to 1024 or unset (or set to 1000 when the authors misunderstood the API and assumed it was base-10 rather than base-2); in both cases the intent was fairly clear of, "this is a hardware random device; it's fine." So let's invert this logic. A hw_random struct's quality knob now controls the maximum quality a driver can produce, or 0 to specify 1024. Then, the module-wide switch called "default_quality" is changed to represent the maximum quality of any driver. By default it's 1024, and the quality of any particular driver is then given by: min(default_quality, rng->quality ?: 1024); This way, the user can still turn this off for weird reasons (and we can replace whatever driver-specific disabling hacks existed in the past), yet we get proper crediting for relevant RNGs. Cc: Dominik Brodowski Cc: Ard Biesheuvel Cc: Herbert Xu Signed-off-by: Jason A. Donenfeld Signed-off-by: Herbert Xu --- include/linux/hw_random.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index 77c2885c4c13..8a3115516a1b 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -34,7 +34,7 @@ * @priv: Private data, for use by the RNG driver. * @quality: Estimation of true entropy in RNG's bitstream * (in bits of entropy per 1024 bits of input; - * valid values: 1 to 1024, or 0 for unknown). + * valid values: 1 to 1024, or 0 for maximum). */ struct hwrng { const char *name; -- cgit From cc7710d0d4ebc6998f04035cde4f32c5ddbe9d7f Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Fri, 11 Nov 2022 18:00:36 +0800 Subject: crypto: hisilicon/qm - add missing pci_dev_put() in q_num_set() pci_get_device() will increase the reference count for the returned pci_dev. We need to use pci_dev_put() to decrease the reference count before q_num_set() returns. Fixes: c8b4b477079d ("crypto: hisilicon - add HiSilicon HPRE accelerator") Signed-off-by: Xiongfeng Wang Reviewed-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index e230c7c46110..c3618255b150 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -384,14 +384,14 @@ struct hisi_qp { static inline int q_num_set(const char *val, const struct kernel_param *kp, unsigned int device) { - struct pci_dev *pdev = pci_get_device(PCI_VENDOR_ID_HUAWEI, - device, NULL); + struct pci_dev *pdev; u32 n, q_num; int ret; if (!val) return -EINVAL; + pdev = pci_get_device(PCI_VENDOR_ID_HUAWEI, device, NULL); if (!pdev) { q_num = min_t(u32, QM_QNUM_V1, QM_QNUM_V2); pr_info("No device found currently, suppose queue number is %u\n", @@ -401,6 +401,8 @@ static inline int q_num_set(const char *val, const struct kernel_param *kp, q_num = QM_QNUM_V1; else q_num = QM_QNUM_V2; + + pci_dev_put(pdev); } ret = kstrtou32(val, 10, &n); -- cgit From b40b62ed7b0ffe8eb2e6fe8bcfb47027c9a93e93 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 12 Nov 2022 02:12:51 +0000 Subject: crypto: hisilicon/qm - modify the process of regs dfx The last register logic and different register logic are combined. Use "u32" instead of 'int' in the regs function input parameter to simplify some checks. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index c3618255b150..be3aedaa96dc 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -471,11 +471,11 @@ int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen); int hisi_qm_sriov_configure(struct pci_dev *pdev, int num_vfs); void hisi_qm_dev_err_init(struct hisi_qm *qm); void hisi_qm_dev_err_uninit(struct hisi_qm *qm); -int hisi_qm_diff_regs_init(struct hisi_qm *qm, - struct dfx_diff_registers *dregs, int reg_len); -void hisi_qm_diff_regs_uninit(struct hisi_qm *qm, int reg_len); +int hisi_qm_regs_debugfs_init(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, u32 reg_len); +void hisi_qm_regs_debugfs_uninit(struct hisi_qm *qm, u32 reg_len); void hisi_qm_acc_diff_regs_dump(struct hisi_qm *qm, struct seq_file *s, - struct dfx_diff_registers *dregs, int regs_len); + struct dfx_diff_registers *dregs, u32 regs_len); pci_ers_result_t hisi_qm_dev_err_detected(struct pci_dev *pdev, pci_channel_state_t state); -- cgit From 196dff2712ca5a2e651977bb2fe6b05474111a83 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 20 Oct 2022 10:39:10 +0200 Subject: efi: random: combine bootloader provided RNG seed with RNG protocol output Instead of blindly creating the EFI random seed configuration table if the RNG protocol is implemented and works, check whether such a EFI configuration table was provided by an earlier boot stage and if so, concatenate the existing and the new seeds, leaving it up to the core code to mix it in and credit it the way it sees fit. This can be used for, e.g., systemd-boot, to pass an additional seed to Linux in a way that can be consumed by the kernel very early. In that case, the following definitions should be used to pass the seed to the EFI stub: struct linux_efi_random_seed { u32 size; // of the 'seed' array in bytes u8 seed[]; }; The memory for the struct must be allocated as EFI_ACPI_RECLAIM_MEMORY pool memory, and the address of the struct in memory should be installed as a EFI configuration table using the following GUID: LINUX_EFI_RANDOM_SEED_TABLE_GUID 1ce1e5bc-7ceb-42f2-81e5-8aadf180f57b Note that doing so is safe even on kernels that were built without this patch applied, but the seed will simply be overwritten with a seed derived from the EFI RNG protocol, if available. The recommended seed size is 32 bytes, and seeds larger than 512 bytes are considered corrupted and ignored entirely. In order to preserve forward secrecy, seeds from previous bootloaders are memzero'd out, and in order to preserve memory, those older seeds are also freed from memory. Freeing from memory without first memzeroing is not safe to do, as it's possible that nothing else will ever overwrite those pages used by EFI. Reviewed-by: Jason A. Donenfeld [ardb: incorporate Jason's followup changes to extend the maximum seed size on the consumer end, memzero() it and drop a needless printk] Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 400edf77bb12..4b27519143f5 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1133,8 +1133,6 @@ void efi_check_for_embedded_firmwares(void); static inline void efi_check_for_embedded_firmwares(void) { } #endif -efi_status_t efi_random_get_seed(void); - #define arch_efi_call_virt(p, f, args...) ((p)->f(args)) /* -- cgit From 489d144563f23911262a652234b80c70c89c978b Mon Sep 17 00:00:00 2001 From: Christian Löhle Date: Thu, 17 Nov 2022 14:42:09 +0000 Subject: mmc: core: Fix ambiguous TRIM and DISCARD arg Clean up the MMC_TRIM_ARGS define that became ambiguous with DISCARD introduction. While at it, let's fix one usage where MMC_TRIM_ARGS falsely included DISCARD too. Fixes: b3bf915308ca ("mmc: core: new discard feature support at eMMC v4.5") Signed-off-by: Christian Loehle Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/11376b5714964345908f3990f17e0701@hyperstone.com Signed-off-by: Ulf Hansson --- include/linux/mmc/mmc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 9c50bc40f8ff..6f7993803ee7 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -451,7 +451,7 @@ static inline bool mmc_ready_for_data(u32 status) #define MMC_SECURE_TRIM1_ARG 0x80000001 #define MMC_SECURE_TRIM2_ARG 0x80008000 #define MMC_SECURE_ARGS 0x80000000 -#define MMC_TRIM_ARGS 0x00008001 +#define MMC_TRIM_OR_DISCARD_ARGS 0x00008003 #define mmc_driver_type_mask(n) (1 << (n)) -- cgit From 9705bc70960459ae09f4a5e77283973bb3a40f57 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2022 17:05:17 +0000 Subject: ftrace: pass fregs to arch_ftrace_set_direct_caller() In subsequent patches we'll arrange for architectures to have an ftrace_regs which is entirely distinct from pt_regs. In preparation for this, we need to minimize the use of pt_regs to where strictly necessary in the core ftrace code. This patch changes the prototype of arch_ftrace_set_direct_caller() to take ftrace_regs rather than pt_regs, and moves the extraction of the pt_regs into arch_ftrace_set_direct_caller(). On x86, arch_ftrace_set_direct_caller() can be used even when CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS=n, and defines struct ftrace_regs. Due to this, it's necessary to define arch_ftrace_set_direct_caller() as a macro to avoid using an incomplete type. I've also moved the body of arch_ftrace_set_direct_caller() after the CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS=y defineidion of struct ftrace_regs. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Florent Revest Cc: Masami Hiramatsu Cc: Steven Rostedt Reviewed-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20221103170520.931305-2-mark.rutland@arm.com Signed-off-by: Will Deacon --- include/linux/ftrace.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 62557d4bffc2..f201fcbfffb0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -37,9 +37,10 @@ extern void ftrace_boot_snapshot(void); static inline void ftrace_boot_snapshot(void) { } #endif -#ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops; struct ftrace_regs; + +#ifdef CONFIG_FUNCTION_TRACER /* * If the arch's mcount caller does not support all of ftrace's * features, then it must call an indirect function that @@ -427,9 +428,7 @@ static inline int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsi { return -ENODEV; } -#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called @@ -443,9 +442,9 @@ static inline int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsi * the return from the trampoline jump to the direct caller * instead of going back to the function it just traced. */ -static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, +static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) { } -#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifdef CONFIG_STACK_TRACER -- cgit From 0ef86097f127d0d73e19aa4dcf86106105e7db09 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2022 17:05:18 +0000 Subject: ftrace: rename ftrace_instruction_pointer_set() -> ftrace_regs_set_instruction_pointer() In subsequent patches we'll add a sew of ftrace_regs_{get,set}_*() helpers. In preparation, this patch renames ftrace_instruction_pointer_set() to ftrace_regs_set_instruction_pointer(). There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Florent Revest Cc: Masami Hiramatsu Cc: Steven Rostedt Reviewed-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20221103170520.931305-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- include/linux/ftrace.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f201fcbfffb0..ec3657a60f85 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -111,12 +111,11 @@ struct ftrace_regs { #define arch_ftrace_get_regs(fregs) (&(fregs)->regs) /* - * ftrace_instruction_pointer_set() is to be defined by the architecture - * if to allow setting of the instruction pointer from the ftrace_regs - * when HAVE_DYNAMIC_FTRACE_WITH_ARGS is set and it supports - * live kernel patching. + * ftrace_regs_set_instruction_pointer() is to be defined by the architecture + * if to allow setting of the instruction pointer from the ftrace_regs when + * HAVE_DYNAMIC_FTRACE_WITH_ARGS is set and it supports live kernel patching. */ -#define ftrace_instruction_pointer_set(fregs, ip) do { } while (0) +#define ftrace_regs_set_instruction_pointer(fregs, ip) do { } while (0) #endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) -- cgit From 94d095ffa0e16bb7f161a2b73bbe5c2795d499a8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2022 17:05:19 +0000 Subject: ftrace: abstract DYNAMIC_FTRACE_WITH_ARGS accesses In subsequent patches we'll arrange for architectures to have an ftrace_regs which is entirely distinct from pt_regs. In preparation for this, we need to minimize the use of pt_regs to where strictly necessary in the core ftrace code. This patch adds new ftrace_regs_{get,set}_*() helpers which can be used to manipulate ftrace_regs. When CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS=y, these can always be used on any ftrace_regs, and when CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS=n these can be used when regs are available. A new ftrace_regs_has_args(fregs) helper is added which code can use to check when these are usable. Co-developed-by: Florent Revest Signed-off-by: Florent Revest Signed-off-by: Mark Rutland Cc: Masami Hiramatsu Cc: Steven Rostedt Reviewed-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20221103170520.931305-4-mark.rutland@arm.com Signed-off-by: Will Deacon --- include/linux/ftrace.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ec3657a60f85..99f1146614c0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -126,6 +126,35 @@ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs return arch_ftrace_get_regs(fregs); } +/* + * When true, the ftrace_regs_{get,set}_*() functions may be used on fregs. + * Note: this can be true even when ftrace_get_regs() cannot provide a pt_regs. + */ +static __always_inline bool ftrace_regs_has_args(struct ftrace_regs *fregs) +{ + if (IS_ENABLED(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)) + return true; + + return ftrace_get_regs(fregs) != NULL; +} + +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +#define ftrace_regs_get_instruction_pointer(fregs) \ + instruction_pointer(ftrace_get_regs(fregs)) +#define ftrace_regs_get_argument(fregs, n) \ + regs_get_kernel_argument(ftrace_get_regs(fregs), n) +#define ftrace_regs_get_stack_pointer(fregs) \ + kernel_stack_pointer(ftrace_get_regs(fregs)) +#define ftrace_regs_return_value(fregs) \ + regs_return_value(ftrace_get_regs(fregs)) +#define ftrace_regs_set_return_value(fregs, ret) \ + regs_set_return_value(ftrace_get_regs(fregs), ret) +#define ftrace_override_function_with_return(fregs) \ + override_function_with_return(ftrace_get_regs(fregs)) +#define ftrace_regs_query_register_offset(name) \ + regs_query_register_offset(name) +#endif + typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); -- cgit From 27b9ecc7a9ba1d0014779bfe5a6dbf630899c6e7 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 15 Nov 2022 07:36:01 +0000 Subject: regulator: Add of_regulator_bulk_get_all It work exactly like regulator_bulk_get() but instead of working on a provided list of names, it seek all consumers properties matching xxx-supply. Signed-off-by: Corentin Labbe Link: https://lore.kernel.org/r/20221115073603.3425396-2-clabbe@baylibre.com Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index ee3b4a014611..276488dad61d 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -244,6 +244,8 @@ int regulator_disable_deferred(struct regulator *regulator, int ms); int __must_check regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); +int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers); int __must_check devm_regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); void devm_regulator_bulk_put(struct regulator_bulk_data *consumers); @@ -479,6 +481,12 @@ static inline int devm_regulator_bulk_get(struct device *dev, int num_consumers, return 0; } +static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers) +{ + return 0; +} + static inline int regulator_bulk_enable(int num_consumers, struct regulator_bulk_data *consumers) { -- cgit From 6c7b2202e4d11572ab23a89aeec49005b94bb966 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 17 Nov 2022 12:25:02 -0500 Subject: KVM: x86: avoid memslot check in NX hugepage recovery if it cannot succeed Since gfn_to_memslot() is relatively expensive, it helps to skip it if it the memslot cannot possibly have dirty logging enabled. In order to do this, add to struct kvm a counter of the number of log-page memslots. While the correct value can only be read with slots_lock taken, the NX recovery thread is content with using an approximate value. Therefore, the counter is an atomic_t. Based on https://lore.kernel.org/kvm/20221027200316.2221027-2-dmatlack@google.com/ by David Matlack. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e6e66c5e56f2..6f0f389f5f9c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -722,6 +722,11 @@ struct kvm { /* The current active memslot set for each address space */ struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct xarray vcpu_array; + /* + * Protected by slots_lock, but can be read outside if an + * incorrect answer is acceptable. + */ + atomic_t nr_memslots_dirty_logging; /* Used to wait for completion of MMU notifiers. */ spinlock_t mn_invalidate_lock; -- cgit From 74c8e6bffbe10c4470139496f930c0b0752c85c9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 29 Oct 2022 00:47:34 -0700 Subject: driver core: Add __alloc_size hint to devm allocators Mark the devm_*alloc()-family of allocations with appropriate __alloc_size()/__realloc_size() hints so the compiler can attempt to reason about buffer lengths from allocations. Cc: Greg Kroah-Hartman Cc: Rasmus Villemoes Cc: Thomas Gleixner Cc: Jason Gunthorpe Cc: Nishanth Menon Cc: Michael Kelley Cc: Dan Williams Cc: Won Chung Signed-off-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20221029074734.gonna.276-kees@kernel.org --- include/linux/device.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 424b55df0272..5e4cd857e74f 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -197,9 +197,9 @@ void devres_remove_group(struct device *dev, void *id); int devres_release_group(struct device *dev, void *id); /* managed devm_k.alloc/kfree for device drivers */ -void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc; +void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __alloc_size(2); void *devm_krealloc(struct device *dev, void *ptr, size_t size, - gfp_t gfp) __must_check; + gfp_t gfp) __must_check __realloc_size(3); __printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) __malloc; __printf(3, 4) char *devm_kasprintf(struct device *dev, gfp_t gfp, @@ -226,7 +226,8 @@ static inline void *devm_kcalloc(struct device *dev, void devm_kfree(struct device *dev, const void *p); char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc; const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp); -void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); +void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) + __realloc_size(3); unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order); -- cgit From 8603b6f58637ce196d68f7749633ea81af196d66 Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Sat, 3 Sep 2022 08:43:30 +0200 Subject: core_pattern: add CPU specifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Statistically, in a large deployment regular segfaults may indicate a CPU issue. Currently, it is not possible to find out what CPU the segfault happened on. There are at least two attempts to improve segfault logging with this regard, but they do not help in case the logs rotate. Hence, lets make sure it is possible to permanently record a CPU the task ran on using a new core_pattern specifier. Link: https://lkml.kernel.org/r/20220903064330.20772-1-oleksandr@redhat.com Signed-off-by: Oleksandr Natalenko Suggested-by: Renaud Métrich Reviewed-by: Oleg Nesterov Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Grzegorz Halat Cc: "Guilherme G. Piccoli" Cc: "Huang, Ying" Cc: Jason A. Donenfeld Cc: Joel Savitz Cc: Jonathan Corbet Cc: Kees Cook Cc: Laurent Dufour Cc: Luis Chamberlain Cc: Rob Herring Cc: Stephen Kitt Cc: Will Deacon Cc: Xiaoming Ni Signed-off-by: Andrew Morton --- include/linux/coredump.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 08a1d3e7e46d..191dcf5af6cb 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -22,6 +22,7 @@ struct coredump_params { struct file *file; unsigned long limit; unsigned long mm_flags; + int cpu; loff_t written; loff_t pos; loff_t to_skip; -- cgit From cade589fdf697dd4981056c09f83924db8e4e4ed Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 29 Sep 2022 12:29:35 +0800 Subject: kexec: replace crash_mem_range with range We already have struct range, so just use it. Link: https://lkml.kernel.org/r/20220929042936.22012-4-bhe@redhat.com Signed-off-by: Li Chen Signed-off-by: Baoquan He Acked-by: Baoquan He Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Chen Lifu Cc: "Eric W . Biederman" Cc: Jianglei Nie Cc: Petr Mladek Cc: Russell King Cc: ye xingchen Cc: Zeal Robot Signed-off-by: Andrew Morton --- include/linux/kexec.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 41a686996aaa..5dd4343c1bbe 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -240,14 +241,10 @@ static inline int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 -struct crash_mem_range { - u64 start, end; -}; - struct crash_mem { unsigned int max_nr_ranges; unsigned int nr_ranges; - struct crash_mem_range ranges[]; + struct range ranges[]; }; extern int crash_exclude_mem_range(struct crash_mem *mem, -- cgit From 5efcecd9a3b18078d3398b359a84c83f549e22cf Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 26 Sep 2022 15:34:34 +0200 Subject: minmax: sanity check constant bounds when clamping The clamp family of functions only makes sense if hi>=lo. If hi and lo are compile-time constants, then raise a build error. Doing so has already caught buggy code. This also introduces the infrastructure to improve the clamping function in subsequent commits. [akpm@linux-foundation.org: coding-style cleanups] [akpm@linux-foundation.org: s@&&\@&& \@] Link: https://lkml.kernel.org/r/20220926133435.1333846-1-Jason@zx2c4.com Signed-off-by: Jason A. Donenfeld Reviewed-by: Andy Shevchenko Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/minmax.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 5433c08fcc68..bb1ad381c129 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -37,6 +37,28 @@ __cmp(x, y, op), \ __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) +#define __clamp(val, lo, hi) \ + __cmp(__cmp(val, lo, >), hi, <) + +#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({ \ + typeof(val) unique_val = (val); \ + typeof(lo) unique_lo = (lo); \ + typeof(hi) unique_hi = (hi); \ + __clamp(unique_val, unique_lo, unique_hi); }) + +#define __clamp_input_check(lo, hi) \ + (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ + __is_constexpr((lo) > (hi)), (lo) > (hi), false))) + +#define __careful_clamp(val, lo, hi) ({ \ + __clamp_input_check(lo, hi) + \ + __builtin_choose_expr(__typecheck(val, lo) && __typecheck(val, hi) && \ + __typecheck(hi, lo) && __is_constexpr(val) && \ + __is_constexpr(lo) && __is_constexpr(hi), \ + __clamp(val, lo, hi), \ + __clamp_once(val, lo, hi, __UNIQUE_ID(__val), \ + __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); }) + /** * min - return minimum of two values of the same or compatible types * @x: first value @@ -86,7 +108,7 @@ * This macro does strict typechecking of @lo/@hi to make sure they are of the * same type as @val. See the unnecessary pointer comparisons. */ -#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) +#define clamp(val, lo, hi) __careful_clamp(val, lo, hi) /* * ..and if you can't take the strict @@ -121,7 +143,7 @@ * This macro does no typechecking and uses temporary variables of type * @type to make all the comparisons. */ -#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) +#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi)) /** * clamp_val - return a value clamped to a given range using val's type -- cgit From 2122e2a4efc2cd139474079e11939b6e07adfacd Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 26 Sep 2022 15:34:35 +0200 Subject: minmax: clamp more efficiently by avoiding extra comparison Currently the clamp algorithm does: if (val > hi) val = hi; if (val < lo) val = lo; But since hi > lo by definition, this can be made more efficient with: if (val > hi) val = hi; else if (val < lo) val = lo; So fix up the clamp and clamp_t functions to do this, adding the same argument checking as for min and min_t. For simple cases, code generation on x86_64 and aarch64 stay about the same: before: cmp edi, edx mov eax, esi cmova edi, edx cmp edi, esi cmovnb eax, edi ret after: cmp edi, esi mov eax, edx cmovnb esi, edi cmp edi, edx cmovb eax, esi ret before: cmp w0, w2 csel w8, w0, w2, lo cmp w8, w1 csel w0, w8, w1, hi ret after: cmp w0, w1 csel w8, w0, w1, hi cmp w0, w2 csel w0, w8, w2, lo ret On MIPS64, however, code generation improves, by removing arithmetic in the second branch: before: sltu $3,$6,$4 bne $3,$0,.L2 move $2,$6 move $2,$4 .L2: sltu $3,$2,$5 bnel $3,$0,.L7 move $2,$5 .L7: jr $31 nop after: sltu $3,$4,$6 beq $3,$0,.L13 move $2,$6 sltu $3,$4,$5 bne $3,$0,.L12 move $2,$4 .L13: jr $31 nop .L12: jr $31 move $2,$5 For more complex cases with surrounding code, the effects are a bit more complicated. For example, consider this simplified version of timestamp_truncate() from fs/inode.c on x86_64: struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned int gran = sb->s_time_gran; t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max); if (t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min) t.tv_nsec = 0; return t; } before: mov r8, rdx mov rdx, rsi mov rcx, QWORD PTR [r8] mov rax, QWORD PTR [rcx+8] mov rcx, QWORD PTR [rcx+16] cmp rax, rdi mov r8, rcx cmovge rdi, rax cmp rdi, rcx cmovle r8, rdi cmp rax, r8 je .L4 cmp rdi, rcx jge .L4 mov rax, r8 ret .L4: xor edx, edx mov rax, r8 ret after: mov rax, QWORD PTR [rdx] mov rdx, QWORD PTR [rax+8] mov rax, QWORD PTR [rax+16] cmp rax, rdi jg .L6 mov r8, rax xor edx, edx .L2: mov rax, r8 ret .L6: cmp rdx, rdi mov r8, rdi cmovge r8, rdx cmp rax, r8 je .L4 xor eax, eax cmp rdx, rdi cmovl rax, rsi mov rdx, rax mov rax, r8 ret .L4: xor edx, edx jmp .L2 In this case, we actually gain a branch, unfortunately, because the compiler's replacement axioms no longer as cleanly apply. So all and all, this change is a bit of a mixed bag. Link: https://lkml.kernel.org/r/20220926133435.1333846-2-Jason@zx2c4.com Signed-off-by: Jason A. Donenfeld Cc: Andy Shevchenko Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/minmax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index bb1ad381c129..396df1121bff 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -38,7 +38,7 @@ __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) #define __clamp(val, lo, hi) \ - __cmp(__cmp(val, lo, >), hi, <) + ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) #define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({ \ typeof(val) unique_val = (val); \ -- cgit From 941baf6febaa88c057192084ca280d976c7c7239 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 8 Sep 2022 21:21:54 +0300 Subject: proc: give /proc/cmdline size Most /proc files don't have length (in fstat sense). This leads to inefficiencies when reading such files with APIs commonly found in modern programming languages. They open file, then fstat descriptor, get st_size == 0 and either assume file is empty or start reading without knowing target size. cat(1) does OK because it uses large enough buffer by default. But naive programs copy-pasted from SO aren't: let mut f = std::fs::File::open("/proc/cmdline").unwrap(); let mut buf: Vec = Vec::new(); f.read_to_end(&mut buf).unwrap(); will result in openat(AT_FDCWD, "/proc/cmdline", O_RDONLY|O_CLOEXEC) = 3 statx(0, NULL, AT_STATX_SYNC_AS_STAT, STATX_ALL, NULL) = -1 EFAULT (Bad address) statx(3, "", AT_STATX_SYNC_AS_STAT|AT_EMPTY_PATH, STATX_ALL, {stx_mask=STATX_BASIC_STATS|STATX_MNT_ID, stx_attributes=0, stx_mode=S_IFREG|0444, stx_size=0, ...}) = 0 lseek(3, 0, SEEK_CUR) = 0 read(3, "BOOT_IMAGE=(hd3,gpt2)/vmlinuz-5.", 32) = 32 read(3, "19.6-100.fc35.x86_64 root=/dev/m", 32) = 32 read(3, "apper/fedora_localhost--live-roo"..., 64) = 64 read(3, "ocalhost--live-swap rd.lvm.lv=fe"..., 128) = 116 read(3, "", 12) open/stat is OK, lseek looks silly but there are 3 unnecessary reads because Rust starts with 32 bytes per Vec and grows from there. In case of /proc/cmdline, the length is known precisely. Make variables readonly while I'm at it. P.S.: I tried to scp /proc/cpuinfo today and got empty file but this is separate story. Link: https://lkml.kernel.org/r/YxoywlbM73JJN3r+@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- include/linux/init.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 077d7f93b402..2e96756fe1ff 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -143,6 +143,7 @@ struct file_system_type; extern int do_one_initcall(initcall_t fn); extern char __initdata boot_command_line[]; extern char *saved_command_line; +extern unsigned int saved_command_line_len; extern unsigned int reset_devices; /* used by init/main.c */ -- cgit From f6fbd8cbf3ed1915a7b957f2801f7c306a686c08 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 9 Nov 2022 14:14:35 -0500 Subject: lsm,fs: fix vfs_getxattr_alloc() return type and caller error paths The vfs_getxattr_alloc() function currently returns a ssize_t value despite the fact that it only uses int values internally for return values. Fix this by converting vfs_getxattr_alloc() to return an int type and adjust the callers as necessary. As part of these caller modifications, some of the callers are fixed to properly free the xattr value buffer on both success and failure to ensure that memory is not leaked in the failure case. Reviewed-by: Serge Hallyn Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/xattr.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 4c379d23ec6e..2218a9645b89 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -68,9 +68,9 @@ int __vfs_removexattr_locked(struct user_namespace *, struct dentry *, int vfs_removexattr(struct user_namespace *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); -ssize_t vfs_getxattr_alloc(struct user_namespace *mnt_userns, - struct dentry *dentry, const char *name, - char **xattr_value, size_t size, gfp_t flags); +int vfs_getxattr_alloc(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *name, + char **xattr_value, size_t size, gfp_t flags); int xattr_supported_namespace(struct inode *inode, const char *prefix); -- cgit From b169a180bef26679b44484ad24b7d8ae32623a10 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 15 Nov 2022 15:26:43 +0000 Subject: iommu/io-pgtable: Remove map/unmap With all users now calling {map,unmap}_pages, retire the redundant single-page callbacks. Signed-off-by: Robin Murphy Acked-by: Will Deacon Link: https://lore.kernel.org/r/a5a3cbf95c3279982e378cc43dad830322a59868.1668100209.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 1f068dfdb140..1b7a44b35616 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -150,9 +150,7 @@ struct io_pgtable_cfg { /** * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers. * - * @map: Map a physically contiguous memory region. * @map_pages: Map a physically contiguous range of pages of the same size. - * @unmap: Unmap a physically contiguous memory region. * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. * @@ -160,13 +158,9 @@ struct io_pgtable_cfg { * the same names. */ struct io_pgtable_ops { - int (*map)(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp); int (*map_pages)(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped); - size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather); size_t (*unmap_pages)(struct io_pgtable_ops *ops, unsigned long iova, size_t pgsize, size_t pgcount, struct iommu_iotlb_gather *gather); -- cgit From 47c008050aec3e9a13af29dd74cd8b4c112bc07b Mon Sep 17 00:00:00 2001 From: Marcus Folkesson Date: Fri, 28 Oct 2022 09:50:19 +0200 Subject: watchdog: rn5t618: add support for read out bootstatus The PMIC does store the power-off factor internally. Read it out and report it as bootstatus. Signed-off-by: Marcus Folkesson Acked-by: Lee Jones Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20221028075019.2757812-1-marcus.folkesson@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/mfd/rn5t618.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/rn5t618.h b/include/linux/mfd/rn5t618.h index 8aa0bda1af4f..aacb6d51e99c 100644 --- a/include/linux/mfd/rn5t618.h +++ b/include/linux/mfd/rn5t618.h @@ -227,6 +227,15 @@ #define RN5T618_WATCHDOG_WDOGTIM_S 0 #define RN5T618_PWRIRQ_IR_WDOG BIT(6) +#define RN5T618_POFFHIS_PWRON BIT(0) +#define RN5T618_POFFHIS_TSHUT BIT(1) +#define RN5T618_POFFHIS_VINDET BIT(2) +#define RN5T618_POFFHIS_IODET BIT(3) +#define RN5T618_POFFHIS_CPU BIT(4) +#define RN5T618_POFFHIS_WDG BIT(5) +#define RN5T618_POFFHIS_DCLIM BIT(6) +#define RN5T618_POFFHIS_N_OE BIT(7) + enum { RN5T618_DCDC1, RN5T618_DCDC2, -- cgit From 5fdded8448924e3631d466eea499b11606c43640 Mon Sep 17 00:00:00 2001 From: Kant Fan Date: Tue, 25 Oct 2022 15:21:09 +0800 Subject: PM/devfreq: governor: Add a private governor_data for governor The member void *data in the structure devfreq can be overwrite by governor_userspace. For example: 1. The device driver assigned the devfreq governor to simple_ondemand by the function devfreq_add_device() and init the devfreq member void *data to a pointer of a static structure devfreq_simple_ondemand_data by the function devfreq_add_device(). 2. The user changed the devfreq governor to userspace by the command "echo userspace > /sys/class/devfreq/.../governor". 3. The governor userspace alloced a dynamic memory for the struct userspace_data and assigend the member void *data of devfreq to this memory by the function userspace_init(). 4. The user changed the devfreq governor back to simple_ondemand by the command "echo simple_ondemand > /sys/class/devfreq/.../governor". 5. The governor userspace exited and assigned the member void *data in the structure devfreq to NULL by the function userspace_exit(). 6. The governor simple_ondemand fetched the static information of devfreq_simple_ondemand_data in the function devfreq_simple_ondemand_func() but the member void *data of devfreq was assigned to NULL by the function userspace_exit(). 7. The information of upthreshold and downdifferential is lost and the governor simple_ondemand can't work correctly. The member void *data in the structure devfreq is designed for a static pointer used in a governor and inited by the function devfreq_add_device(). This patch add an element named governor_data in the devfreq structure which can be used by a governor(E.g userspace) who want to assign a private data to do some private things. Fixes: ce26c5bb9569 ("PM / devfreq: Add basic governors") Cc: stable@vger.kernel.org # 5.10+ Reviewed-by: Chanwoo Choi Acked-by: MyungJoo Ham Signed-off-by: Kant Fan Signed-off-by: Chanwoo Choi --- include/linux/devfreq.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 34aab4dd336c..4dc7cda4fd46 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -152,8 +152,8 @@ struct devfreq_stats { * @max_state: count of entry present in the frequency table. * @previous_freq: previously configured frequency value. * @last_status: devfreq user device info, performance statistics - * @data: Private data of the governor. The devfreq framework does not - * touch this. + * @data: devfreq driver pass to governors, governor should not change it. + * @governor_data: private data for governors, devfreq core doesn't touch it. * @user_min_freq_req: PM QoS minimum frequency request from user (via sysfs) * @user_max_freq_req: PM QoS maximum frequency request from user (via sysfs) * @scaling_min_freq: Limit minimum frequency requested by OPP interface @@ -193,7 +193,8 @@ struct devfreq { unsigned long previous_freq; struct devfreq_dev_status last_status; - void *data; /* private data for governors */ + void *data; + void *governor_data; struct dev_pm_qos_request user_min_freq_req; struct dev_pm_qos_request user_max_freq_req; -- cgit From ef66c5475d7fb864c2418d3bdd19dee46324624b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 19 Nov 2022 23:10:01 -0600 Subject: bpf: Allow multiple modifiers in reg_type_str() prefix reg_type_str() in the verifier currently only allows a single register type modifier to be present in the 'prefix' string which is eventually stored in the env type_str_buf. This currently works fine because there are no overlapping type modifiers, but once PTR_TRUSTED is added, that will no longer be the case. This patch updates reg_type_str() to support having multiple modifiers in the prefix string, and updates the size of type_str_buf to be 128 bytes. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221120051004.3605026-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 23f30c685f28..608dde740fef 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -19,7 +19,7 @@ */ #define BPF_MAX_VAR_SIZ (1 << 29) /* size of type_str_buf in bpf_verifier. */ -#define TYPE_STR_BUF_LEN 64 +#define TYPE_STR_BUF_LEN 128 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that -- cgit From 3f00c52393445ed49aadc1a567aa502c6333b1a1 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 19 Nov 2022 23:10:02 -0600 Subject: bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs Kfuncs currently support specifying the KF_TRUSTED_ARGS flag to signal to the verifier that it should enforce that a BPF program passes it a "safe", trusted pointer. Currently, "safe" means that the pointer is either PTR_TO_CTX, or is refcounted. There may be cases, however, where the kernel passes a BPF program a safe / trusted pointer to an object that the BPF program wishes to use as a kptr, but because the object does not yet have a ref_obj_id from the perspective of the verifier, the program would be unable to pass it to a KF_ACQUIRE | KF_TRUSTED_ARGS kfunc. The solution is to expand the set of pointers that are considered trusted according to KF_TRUSTED_ARGS, so that programs can invoke kfuncs with these pointers without getting rejected by the verifier. There is already a PTR_UNTRUSTED flag that is set in some scenarios, such as when a BPF program reads a kptr directly from a map without performing a bpf_kptr_xchg() call. These pointers of course can and should be rejected by the verifier. Unfortunately, however, PTR_UNTRUSTED does not cover all the cases for safety that need to be addressed to adequately protect kfuncs. Specifically, pointers obtained by a BPF program "walking" a struct are _not_ considered PTR_UNTRUSTED according to BPF. For example, say that we were to add a kfunc called bpf_task_acquire(), with KF_ACQUIRE | KF_TRUSTED_ARGS, to acquire a struct task_struct *. If we only used PTR_UNTRUSTED to signal that a task was unsafe to pass to a kfunc, the verifier would mistakenly allow the following unsafe BPF program to be loaded: SEC("tp_btf/task_newtask") int BPF_PROG(unsafe_acquire_task, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired, *nested; nested = task->last_wakee; /* Would not be rejected by the verifier. */ acquired = bpf_task_acquire(nested); if (!acquired) return 0; bpf_task_release(acquired); return 0; } To address this, this patch defines a new type flag called PTR_TRUSTED which tracks whether a PTR_TO_BTF_ID pointer is safe to pass to a KF_TRUSTED_ARGS kfunc or a BPF helper function. PTR_TRUSTED pointers are passed directly from the kernel as a tracepoint or struct_ops callback argument. Any nested pointer that is obtained from walking a PTR_TRUSTED pointer is no longer PTR_TRUSTED. From the example above, the struct task_struct *task argument is PTR_TRUSTED, but the 'nested' pointer obtained from 'task->last_wakee' is not PTR_TRUSTED. A subsequent patch will add kfuncs for storing a task kfunc as a kptr, and then another patch will add selftests to validate. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221120051004.3605026-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 30 ++++++++++++++++++++ include/linux/bpf_verifier.h | 7 +++++ include/linux/btf.h | 65 ++++++++++++++++++++++++++++---------------- 3 files changed, 78 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b32376ce746..c9eafa67f2a2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -543,6 +543,35 @@ enum bpf_type_flag { */ MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), + /* PTR was passed from the kernel in a trusted context, and may be + * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. + * PTR_UNTRUSTED refers to a kptr that was read directly from a map + * without invoking bpf_kptr_xchg(). What we really need to know is + * whether a pointer is safe to pass to a kfunc or BPF helper function. + * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF + * helpers, they do not cover all possible instances of unsafe + * pointers. For example, a pointer that was obtained from walking a + * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the + * fact that it may be NULL, invalid, etc. This is due to backwards + * compatibility requirements, as this was the behavior that was first + * introduced when kptrs were added. The behavior is now considered + * deprecated, and PTR_UNTRUSTED will eventually be removed. + * + * PTR_TRUSTED, on the other hand, is a pointer that the kernel + * guarantees to be valid and safe to pass to kfuncs and BPF helpers. + * For example, pointers passed to tracepoint arguments are considered + * PTR_TRUSTED, as are pointers that are passed to struct_ops + * callbacks. As alluded to above, pointers that are obtained from + * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a + * struct task_struct *task is PTR_TRUSTED, then accessing + * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored + * in a BPF register. Similarly, pointers passed to certain programs + * types such as kretprobes are not guaranteed to be valid, as they may + * for example contain an object that was recently freed. + */ + PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -636,6 +665,7 @@ enum bpf_return_type { RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + RET_PTR_TO_BTF_ID_TRUSTED = PTR_TRUSTED | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 608dde740fef..545152ac136c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -680,4 +680,11 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) + +static inline bool bpf_type_has_unsafe_modifiers(u32 type) +{ + return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; +} + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/btf.h b/include/linux/btf.h index d5b26380a60f..d38aa4251c28 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -19,36 +19,53 @@ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ #define KF_KPTR_GET (1 << 3) /* kfunc returns reference to a kptr */ -/* Trusted arguments are those which are meant to be referenced arguments with - * unchanged offset. It is used to enforce that pointers obtained from acquire - * kfuncs remain unmodified when being passed to helpers taking trusted args. +/* Trusted arguments are those which are guaranteed to be valid when passed to + * the kfunc. It is used to enforce that pointers obtained from either acquire + * kfuncs, or from the main kernel on a tracepoint or struct_ops callback + * invocation, remain unmodified when being passed to helpers taking trusted + * args. * - * Consider - * struct foo { - * int data; - * struct foo *next; - * }; + * Consider, for example, the following new task tracepoint: * - * struct bar { - * int data; - * struct foo f; - * }; + * SEC("tp_btf/task_newtask") + * int BPF_PROG(new_task_tp, struct task_struct *task, u64 clone_flags) + * { + * ... + * } * - * struct foo *f = alloc_foo(); // Acquire kfunc - * struct bar *b = alloc_bar(); // Acquire kfunc + * And the following kfunc: * - * If a kfunc set_foo_data() wants to operate only on the allocated object, it - * will set the KF_TRUSTED_ARGS flag, which will prevent unsafe usage like: + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) * - * set_foo_data(f, 42); // Allowed - * set_foo_data(f->next, 42); // Rejected, non-referenced pointer - * set_foo_data(&f->next, 42);// Rejected, referenced, but wrong type - * set_foo_data(&b->f, 42); // Rejected, referenced, but bad offset + * All invocations to the kfunc must pass the unmodified, unwalked task: * - * In the final case, usually for the purposes of type matching, it is deduced - * by looking at the type of the member at the offset, but due to the - * requirement of trusted argument, this deduction will be strict and not done - * for this case. + * bpf_task_acquire(task); // Allowed + * bpf_task_acquire(task->last_wakee); // Rejected, walked task + * + * Programs may also pass referenced tasks directly to the kfunc: + * + * struct task_struct *acquired; + * + * acquired = bpf_task_acquire(task); // Allowed, same as above + * bpf_task_acquire(acquired); // Allowed + * bpf_task_acquire(task); // Allowed + * bpf_task_acquire(acquired->last_wakee); // Rejected, walked task + * + * Programs may _not_, however, pass a task from an arbitrary fentry/fexit, or + * kprobe/kretprobe to the kfunc, as BPF cannot guarantee that all of these + * pointers are guaranteed to be safe. For example, the following BPF program + * would be rejected: + * + * SEC("kretprobe/free_task") + * int BPF_PROG(free_task_probe, struct task_struct *tsk) + * { + * struct task_struct *acquired; + * + * acquired = bpf_task_acquire(acquired); // Rejected, not a trusted pointer + * bpf_task_release(acquired); + * + * return 0; + * } */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ -- cgit From fd264ca020948a743e4c36731dfdecc4a812153c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 20 Nov 2022 11:54:32 -0800 Subject: bpf: Add a kfunc to type cast from bpf uapi ctx to kernel ctx Implement bpf_cast_to_kern_ctx() kfunc which does a type cast of a uapi ctx object to the corresponding kernel ctx. Previously if users want to access some data available in kctx but not in uapi ctx, bpf_probe_read_kernel() helper is needed. The introduction of bpf_cast_to_kern_ctx() allows direct memory access which makes code simpler and easier to understand. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221120195432.3113982-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index d38aa4251c28..9ed00077db6e 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -487,6 +487,7 @@ const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); +int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); #else @@ -531,6 +532,10 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, { return NULL; } +static inline int get_kern_ctx_btf_id(struct bpf_verifier_log *log, + enum bpf_prog_type prog_type) { + return -EINVAL; +} static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2) { -- cgit From 0dff89c4488f90c01807d9c12023433703206523 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 8 Sep 2022 20:07:14 +0800 Subject: sched: Move numa_balancing sysctls to its own file The sysctl_numa_balancing_promote_rate_limit and sysctl_numa_balancing are part of sched, move them to its own file. Signed-off-by: Kefeng Wang Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 303ee7dd0c7e..5a64582b086b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -4,8 +4,6 @@ #include -struct ctl_table; - #ifdef CONFIG_DETECT_HUNG_TASK /* used for hung_task and block/ */ extern unsigned long sysctl_hung_task_timeout_secs; @@ -27,12 +25,8 @@ enum sched_tunable_scaling { #ifdef CONFIG_NUMA_BALANCING extern int sysctl_numa_balancing_mode; -extern unsigned int sysctl_numa_balancing_promote_rate_limit; #else #define sysctl_numa_balancing_mode 0 #endif -int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); - #endif /* _LINUX_SCHED_SYSCTL_H */ -- cgit From e8753e416c7ec39812cf92608aa95640caca70fa Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 13 Nov 2022 18:08:27 +0800 Subject: percpu: adjust the value of PERCPU_DYNAMIC_EARLY_SIZE LKP reported a build failure as below on the following patch "mm/slub, percpu: correct the calculation of early percpu allocation size" ~~~~~~ In file included from : In function 'alloc_kmem_cache_cpus', inlined from 'kmem_cache_open' at mm/slub.c:4340:6: >> >> include/linux/compiler_types.h:357:45: error: call to '__compiletime_assert_474' declared with attribute error: BUILD_BUG_ON failed: PERCPU_DYNAMIC_EARLY_SIZE < NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu) 357 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) ~~~~~~ From the kernel config file provided by LKP, the building was made on arm64 with below Kconfig item enabled: CONFIG_ZONE_DMA=y CONFIG_SLUB_CPU_PARTIAL=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_SLUB_STATS=y CONFIG_ARM64_PAGE_SHIFT=16 CONFIG_ARM64_64K_PAGES=y Then we will have: NR_KMALLOC_TYPES:4 KMALLOC_SHIFT_HIGH:17 sizeof(struct kmem_cache_cpu):184 The product of them is 12512, which is bigger than PERCPU_DYNAMIC_EARLY_SIZE, 12K. Hence, the BUILD_BUG_ON in alloc_kmem_cache_cpus() is triggered. Earlier, in commit 099a19d91ca4 ("percpu: allow limited allocation before slab is online"), PERCPU_DYNAMIC_EARLY_SIZE was introduced and set to 12K which is equal to the then PERPCU_DYNAMIC_RESERVE. Later, in commit 1a4d76076cda ("percpu: implement asynchronous chunk population"), PERPCU_DYNAMIC_RESERVE was increased by 8K, while PERCPU_DYNAMIC_EARLY_SIZE was kept unchanged. So, here increase PERCPU_DYNAMIC_EARLY_SIZE by 8K too to accommodate to the slub's requirement. Reported-by: kernel test robot Signed-off-by: Baoquan He Acked-by: Dennis Zhou Signed-off-by: Vlastimil Babka --- include/linux/percpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index f1ec5ad1351c..3dbb6fb70658 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -42,7 +42,7 @@ * larger than PERCPU_DYNAMIC_EARLY_SIZE. */ #define PERCPU_DYNAMIC_EARLY_SLOTS 128 -#define PERCPU_DYNAMIC_EARLY_SIZE (12 << 10) +#define PERCPU_DYNAMIC_EARLY_SIZE (20 << 10) /* * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy -- cgit From 838de63b101147fc7d8af828465cf6d1d30232a8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 10 Nov 2022 09:10:30 +0100 Subject: mm/slab: move and adjust kernel-doc for kmem_cache_alloc Alexander reports an issue with the kmem_cache_alloc() comment in mm/slab.c: > The current comment mentioned that the flags only matters if the > cache has no available objects. It's different for the __GFP_ZERO > flag which will ensure that the returned object is always zeroed > in any case. > I have the feeling I run into this question already two times if > the user need to zero the object or not, but the user does not need > to zero the object afterwards. However another use of __GFP_ZERO > and only zero the object if the cache has no available objects would > also make no sense. and suggests thus mentioning __GFP_ZERO as the exception. But on closer inspection, the part about flags being only relevant if cache has no available objects is misleading. The slab user has no reliable way to determine if there are available objects, and e.g. the might_sleep() debug check can be performed even if objects are available, so passing correct flags given the allocation context always matters. Thus remove that sentence completely, and while at it, move the comment to from SLAB-specific mm/slab.c to the common include/linux/slab.h The comment otherwise refers flags description for kmalloc(), so add __GFP_ZERO comment there and remove a very misleading GFP_HIGHUSER (not applicable to slab) description from there. Mention kzalloc() and kmem_cache_zalloc() shortcuts. Reported-by: Alexander Aring Link: https://lore.kernel.org/all/20221011145413.8025-1-aahringo@redhat.com/ Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 90877fcde70b..1bf631f29cd2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -441,7 +441,18 @@ static_assert(PAGE_SHIFT <= 20); #endif /* !CONFIG_SLOB */ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc; + +/** + * kmem_cache_alloc - Allocate an object + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache. + * See kmem_cache_zalloc() for a shortcut of adding __GFP_ZERO to flags. + * + * Return: pointer to the new object or %NULL in case of error + */ +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) __assume_slab_alignment __malloc; void kmem_cache_free(struct kmem_cache *s, void *objp); @@ -506,9 +517,9 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align __alloc_size(1); /** - * kmalloc - allocate memory + * kmalloc - allocate kernel memory * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate. + * @flags: describe the allocation context * * kmalloc is the normal method of allocating memory * for objects smaller than page size in the kernel. @@ -535,12 +546,12 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align * %GFP_ATOMIC * Allocation will not sleep. May use emergency pools. * - * %GFP_HIGHUSER - * Allocate memory from high memory on behalf of user. - * * Also it is possible to set different flags by OR'ing * in one or more of the following additional @flags: * + * %__GFP_ZERO + * Zero the allocated memory before returning. Also see kzalloc(). + * * %__GFP_HIGH * This allocation has high priority and may use emergency pools. * -- cgit From 3bf019334fbbb51723587f9819c62d3d6f9d6f3f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 19:51:58 -0800 Subject: slab: Clean up SLOB vs kmalloc() definition As already done for kmalloc_node(), clean up the #ifdef usage in the definition of kmalloc() so that the SLOB-only version is an entirely separate and much more readable function. Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Roman Gushchin Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: linux-mm@kvack.org Signed-off-by: Kees Cook Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 45efc6c553b8..8a25d1b2e420 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -536,15 +536,15 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align * Try really hard to succeed the allocation but fail * eventually. */ +#ifndef CONFIG_SLOB static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { -#ifndef CONFIG_SLOB unsigned int index; -#endif + if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large(size, flags); -#ifndef CONFIG_SLOB + index = kmalloc_index(size); if (!index) @@ -553,10 +553,18 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) return kmalloc_trace( kmalloc_caches[kmalloc_type(flags)][index], flags, size); -#endif } return __kmalloc(size, flags); } +#else +static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) +{ + if (__builtin_constant_p(size) && size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large(size, flags); + + return __kmalloc(size, flags); +} +#endif #ifndef CONFIG_SLOB static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) -- cgit From 6fa57d78aa7f212fd7c0de70f5756e18513dcdcf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 19:51:59 -0800 Subject: slab: Remove special-casing of const 0 size allocations Passing a constant-0 size allocation into kmalloc() or kmalloc_node() does not need to be a fast-path operation, so the static return value can be removed entirely. This makes sure that all paths through the inlines result in a full extern function call, where __alloc_size() hints will actually be seen[1] by GCC. (A constant return value of 0 means the "0" allocation size won't be propagated by the inline.) [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96503 Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Roman Gushchin Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: linux-mm@kvack.org Signed-off-by: Kees Cook Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 8a25d1b2e420..134cc05e5745 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -539,17 +539,13 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align #ifndef CONFIG_SLOB static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) { - if (__builtin_constant_p(size)) { + if (__builtin_constant_p(size) && size) { unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large(size, flags); index = kmalloc_index(size); - - if (!index) - return ZERO_SIZE_PTR; - return kmalloc_trace( kmalloc_caches[kmalloc_type(flags)][index], flags, size); @@ -569,17 +565,13 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) #ifndef CONFIG_SLOB static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) { - if (__builtin_constant_p(size)) { + if (__builtin_constant_p(size) && size) { unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large_node(size, flags, node); index = kmalloc_index(size); - - if (!index) - return ZERO_SIZE_PTR; - return kmalloc_node_trace( kmalloc_caches[kmalloc_type(flags)][index], flags, node, size); -- cgit From 28ef7670414e309d8bbee41f9389b7e21a58572c Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 11 Aug 2022 00:06:50 +0200 Subject: mtd: spi-nor: remember full JEDEC flash ID At the moment, we print the JEDEC ID that is stored in our database. The generic flash support won't have such an entry in our database. To find out the JEDEC ID later we will have to cache it. There is also another advantage: If the flash is found in the database, the ID could be truncated because the ID of the entry is used which can be shorter. Some flashes still holds valuable information in the bytes after the JEDEC ID and come in handy during debugging of when coping with INFO6() entries. These are not accessible for now. Save a copy of the ID bytes after reading and display it via debugfs. Signed-off-by: Michael Walle Signed-off-by: Tudor Ambarus Reviewed-by: Takahiro Kuwano Link: https://lore.kernel.org/r/20220810220654.1297699-4-michael@walle.cc --- include/linux/mtd/spi-nor.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 42218a1164f6..25765556223a 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -349,6 +349,8 @@ struct spi_nor_flash_parameter; * @bouncebuf: bounce buffer used when the buffer passed by the MTD * layer is not DMA-able * @bouncebuf_size: size of the bounce buffer + * @id: The flash's ID bytes. Always contains + * SPI_NOR_MAX_ID_LEN bytes. * @info: SPI NOR part JEDEC MFR ID and other info * @manufacturer: SPI NOR manufacturer * @addr_nbytes: number of address bytes @@ -379,6 +381,7 @@ struct spi_nor { struct spi_mem *spimem; u8 *bouncebuf; size_t bouncebuf_size; + u8 *id; const struct flash_info *info; const struct spi_nor_manufacturer *manufacturer; u8 addr_nbytes; -- cgit From c21dd79e9909c9cf90f8b1ca8ad2753cedb6c655 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Thu, 17 Nov 2022 12:35:42 +0000 Subject: pinconf-generic: fix style issues in pin_config_param doc Fixes following issues introduced in a previous commit to clarify values for pin config pull up and down types. - replace spaces with tabs to be consistent with rest of the doc - use capitalization for unit (ohms -> Ohms) Signed-off-by: Niyas Sait Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20221117123542.1154252-1-niyas.sait@linaro.org Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index c9c10142657e..d74b7a4ea154 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -39,7 +39,7 @@ struct pinctrl_map; * @PIN_CONFIG_BIAS_PULL_DOWN: the pin will be pulled down (usually with high * impedance to GROUND). If the argument is != 0 pull-down is enabled, * the value is interpreted by the driver and can be custom or an SI unit - * such as ohms. + * such as Ohms. * @PIN_CONFIG_BIAS_PULL_PIN_DEFAULT: the pin will be pulled up or down based * on embedded knowledge of the controller hardware, like current mux * function. The pull direction and possibly strength too will normally @@ -51,7 +51,7 @@ struct pinctrl_map; * @PIN_CONFIG_BIAS_PULL_UP: the pin will be pulled up (usually with high * impedance to VDD). If the argument is != 0 pull-up is enabled, * the value is interpreted by the driver and can be custom or an SI unit - * such as ohms. + * such as Ohms. * @PIN_CONFIG_DRIVE_OPEN_DRAIN: the pin will be driven with open drain (open * collector) which means it is usually wired with other output ports * which are then pulled up with an external resistor. Setting this -- cgit From fce3caea0f241f5d34855c82c399d5e0e2d91f07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:42 +0100 Subject: blk-crypto: don't use struct request_queue for public interfaces Switch all public blk-crypto interfaces to use struct block_device arguments to specify the device they operate on instead of th request_queue, which is a block layer implementation detail. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-crypto.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 69b24fe92cbf..561ca92e204d 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -71,9 +71,6 @@ struct bio_crypt_ctx { #include #include -struct request; -struct request_queue; - #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) @@ -94,13 +91,13 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, unsigned int dun_bytes, unsigned int data_unit_size); -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q); +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key); -int blk_crypto_evict_key(struct request_queue *q, +int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -- cgit From 6715c98b6cf003f26b1b2f655393134e9d999a05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:43 +0100 Subject: blk-crypto: add a blk_crypto_config_supported_natively helper Add a blk_crypto_config_supported_natively helper that wraps __blk_crypto_cfg_supported to retrieve the crypto_profile from the request queue. With this fscrypt can stop including blk-crypto-profile.h and rely on the public consumer interface in blk-crypto.h. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-crypto.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 561ca92e204d..a33d32f5c268 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -97,6 +97,8 @@ int blk_crypto_start_using_key(struct block_device *bdev, int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg); bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); -- cgit From 3569788c08235c6f3e9e6ca724b2df44787ff487 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:44 +0100 Subject: blk-crypto: move internal only declarations to blk-crypto-internal.h blk_crypto_get_keyslot, blk_crypto_put_keyslot, __blk_crypto_evict_key and __blk_crypto_cfg_supported are only used internally by the blk-crypto code, so move the out of blk-crypto-profile.h, which is included by drivers that supply blk-crypto functionality. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-crypto-profile.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index bbab65bd5428..e6802b69cdd6 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -138,18 +138,6 @@ int devm_blk_crypto_profile_init(struct device *dev, unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot); -blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, - const struct blk_crypto_key *key, - struct blk_crypto_keyslot **slot_ptr); - -void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); - -bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, - const struct blk_crypto_config *cfg); - -int __blk_crypto_evict_key(struct blk_crypto_profile *profile, - const struct blk_crypto_key *key); - void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); -- cgit From d28a1de5d1128d6caef15fdc04d3f6484dc7264b Mon Sep 17 00:00:00 2001 From: Liam Beguin Date: Fri, 18 Nov 2022 13:23:07 -0500 Subject: math64: favor kernel-doc from header files Fix the kernel-doc markings for div64 functions to point to the header file instead of the lib/ directory. This avoids having implementation specific comments in generic documentation. Furthermore, given that some kernel-doc comments are identical, drop them from lib/math64 and only keep there comments that add implementation details. Signed-off-by: Liam Beguin Acked-by: Randy Dunlap Tested-by: Randy Dunlap Link: https://lore.kernel.org/r/20221118182309.3824530-1-liambeguin@gmail.com Signed-off-by: Jonathan Corbet --- include/linux/math64.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index a14f40de1dca..1538844fcb51 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -29,7 +29,7 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) return dividend / divisor; } -/* +/** * div_s64_rem - signed 64bit divide with 32bit divisor with remainder * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor @@ -43,7 +43,7 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) return dividend / divisor; } -/* +/** * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor @@ -57,7 +57,7 @@ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) return dividend / divisor; } -/* +/** * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor @@ -69,7 +69,7 @@ static inline u64 div64_u64(u64 dividend, u64 divisor) return dividend / divisor; } -/* +/** * div64_s64 - signed 64bit divide with 64bit divisor * @dividend: signed 64bit dividend * @divisor: signed 64bit divisor @@ -300,7 +300,7 @@ u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); #define DIV64_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u64 _tmp = (divisor); div64_u64((dividend) + _tmp / 2, _tmp); }) -/* +/** * DIV_U64_ROUND_CLOSEST - unsigned 64bit divide with 32bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor @@ -313,7 +313,7 @@ u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); #define DIV_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u32 _tmp = (divisor); div_u64((u64)(dividend) + _tmp / 2, _tmp); }) -/* +/** * DIV_S64_ROUND_CLOSEST - signed 64bit divide with 32bit divisor rounded to nearest integer * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor -- cgit From 090f13cac84d618d8d391e72646bea462aae9486 Mon Sep 17 00:00:00 2001 From: Liam Beguin Date: Fri, 18 Nov 2022 13:23:08 -0500 Subject: math64: add kernel-doc for DIV64_U64_ROUND_UP Add kernel-doc for DIV64_U64_ROUND_UP so that it appears in the documentation. Signed-off-by: Liam Beguin Acked-by: Randy Dunlap Tested-by: Randy Dunlap Link: https://lore.kernel.org/r/20221118182309.3824530-2-liambeguin@gmail.com Signed-off-by: Jonathan Corbet --- include/linux/math64.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index 1538844fcb51..cf3b0099674a 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -284,6 +284,16 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); +/** + * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up + * @ll: unsigned 64bit dividend + * @d: unsigned 64bit divisor + * + * Divide unsigned 64bit dividend by unsigned 64bit divisor + * and round up. + * + * Return: dividend / divisor rounded up + */ #define DIV64_U64_ROUND_UP(ll, d) \ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) -- cgit From a898db21cc8ff4205784253355d330517983bbbb Mon Sep 17 00:00:00 2001 From: Liam Beguin Date: Fri, 18 Nov 2022 13:23:09 -0500 Subject: math64: fix kernel-doc return value warnings Fix the following kernel-doc warnings by adding a description for return values of div_[us]64. math64.h:126: warning: No description found for return value of 'div_u64' math64.h:139: warning: No description found for return value of 'div_s64' Signed-off-by: Liam Beguin Link: https://lore.kernel.org/r/20221118182309.3824530-3-liambeguin@gmail.com Signed-off-by: Jonathan Corbet --- include/linux/math64.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index cf3b0099674a..8958f4c005c1 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -120,6 +120,8 @@ extern s64 div64_s64(s64 dividend, s64 divisor); * This is the most common 64bit divide and should be used if possible, * as many 32bit archs can optimize this variant better than a full 64bit * divide. + * + * Return: dividend / divisor */ #ifndef div_u64 static inline u64 div_u64(u64 dividend, u32 divisor) @@ -133,6 +135,8 @@ static inline u64 div_u64(u64 dividend, u32 divisor) * div_s64 - signed 64bit divide with 32bit divisor * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor + * + * Return: dividend / divisor */ #ifndef div_s64 static inline s64 div_s64(s64 dividend, s32 divisor) -- cgit From 870c2481174b839e7159555127bc8b5a5d0699ba Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 31 May 2022 09:14:03 +0300 Subject: net/mlx5: cmdif, Print info on any firmware cmd failure to tracepoint While moving to new CMD API (quiet API), some pre-existing flows may call the new API function that in case of error, returns the error instead of printing it as previously done. For such flows we bring back the print but to tracepoint this time for sys admins to have the ability to check for errors especially for commands using the new quiet API. Tracepoint output example: devlink-1333 [001] ..... 822.746922: mlx5_cmd: ACCESS_REG(0x805) op_mod(0x0) failed, status bad resource(0x5), syndrome (0xb06e1f), err(-22) Fixes: f23519e542e5 ("net/mlx5: cmdif, Add new api for command execution") Signed-off-by: Moshe Shemesh Reviewed-by: Shay Drory Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index af2ceb4160bc..06cbad166225 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -981,6 +981,7 @@ struct mlx5_async_work { struct mlx5_async_ctx *ctx; mlx5_async_cbk_t user_callback; u16 opcode; /* cmd opcode */ + u16 op_mod; /* cmd op_mod */ void *out; /* pointer to the cmd output buffer */ }; -- cgit From a6d99022e56e8c1ddc4c75895ed9e3ce5da88453 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 21 Nov 2022 16:08:42 +0100 Subject: regmap: add regmap_might_sleep() With the dawn of MMIO gpio-regmap users, it is desirable to let gpio-regmap ask the regmap if it might sleep during an access so it can pass that information to gpiochip. Add a new regmap_might_sleep() to query the regmap. Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20221121150843.1562603-1-michael@walle.cc Signed-off-by: Mark Brown --- include/linux/regmap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index ca3434dca3a0..3faf5d5dbb26 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1219,6 +1219,7 @@ static inline int regmap_write_bits(struct regmap *map, unsigned int reg, int regmap_get_val_bytes(struct regmap *map); int regmap_get_max_register(struct regmap *map); int regmap_get_reg_stride(struct regmap *map); +bool regmap_might_sleep(struct regmap *map); int regmap_async_complete(struct regmap *map); bool regmap_can_raw_write(struct regmap *map); size_t regmap_get_raw_read_max(struct regmap *map); @@ -1905,6 +1906,12 @@ static inline int regmap_get_reg_stride(struct regmap *map) return -EINVAL; } +static inline bool regmap_might_sleep(struct regmap *map) +{ + WARN_ONCE(1, "regmap API is disabled"); + return true; +} + static inline int regcache_sync(struct regmap *map) { WARN_ONCE(1, "regmap API is disabled"); -- cgit From 03e02acda8e267a8183e1e0ed289ff1ef9cd7ed8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:13:44 -0700 Subject: eventfd: provide a eventfd_signal_mask() helper This is identical to eventfd_signal(), but it allows the caller to pass in a mask to be used for the poll wakeup key. The use case is avoiding repeated multishot triggers if we have a dependency between eventfd and io_uring. If we setup an eventfd context and register that as the io_uring eventfd, and at the same time queue a multishot poll request for the eventfd context, then any CQE posted will repeatedly trigger the multishot request until it terminates when the CQ ring overflows. In preparation for io_uring detecting this circular dependency, add the mentioned helper so that io_uring can pass in EPOLL_URING as part of the poll wakeup key. Cc: stable@vger.kernel.org # 6.0 [axboe: fold in !CONFIG_EVENTFD fix from Zhang Qilong] Signed-off-by: Jens Axboe --- include/linux/eventfd.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 30eb30d6909b..786824f58d3d 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -40,6 +40,7 @@ struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); @@ -66,6 +67,12 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx, int n) return -ENOSYS; } +static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, + unsigned mask) +{ + return -ENOSYS; +} + static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) { -- cgit From bbc7e1bed1f5297581325e739f0e47f650a386fa Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 16 Nov 2022 17:16:37 +0100 Subject: random: add back async readiness notifier This is required by vsprint, because it can't do things synchronously from hardirq context, and it will be useful for an EFI notifier as well. I didn't initially want to do this, but with two potential consumers now, it seems worth it. Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 579117d83eb8..b1a34181eed6 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -120,6 +120,7 @@ void __init random_init_early(const char *command_line); void __init random_init(void); bool rng_is_initialized(void); int wait_for_random_bytes(void); +int execute_with_initialized_rng(struct notifier_block *nb); /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes). * Returns the result of the call to wait_for_random_bytes. */ -- cgit From 88da4e8113110d5f4ebdd2f8cd0899e300cd1954 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 17 Nov 2022 13:08:00 +0200 Subject: pwm: Add a stub for devm_pwmchip_add() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The devm_pwmchip_add() can be called by a module that optionally instantiates PWM chip. In the case of CONFIG_PWM=n, the compilation can't be performed. Hence, add a necessary stub. Signed-off-by: Andy Shevchenko Acked-by: Thierry Reding Reviewed-by: Mika Westerberg Acked-by: Uwe Kleine-König --- include/linux/pwm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index d70c6e5a839d..bba492eea96c 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -478,6 +478,11 @@ static inline int pwmchip_remove(struct pwm_chip *chip) return -EINVAL; } +static inline int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip) +{ + return -EINVAL; +} + static inline struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip, unsigned int index, const char *label) -- cgit From 2fd36aa0ad1c714d0c92a025ae28c8721ffe108e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 17 Nov 2022 13:08:03 +0200 Subject: pwm: lpss: Allow other drivers to enable PWM LPSS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PWM LPSS device can be embedded in another device. In order to enable it, allow that drivers to probe a corresponding device. Signed-off-by: Andy Shevchenko Acked-by: Thierry Reding Reviewed-by: Mika Westerberg Reviewed-by: Hans de Goede Acked-by: Uwe Kleine-König --- include/linux/platform_data/x86/pwm-lpss.h | 33 ++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 include/linux/platform_data/x86/pwm-lpss.h (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pwm-lpss.h b/include/linux/platform_data/x86/pwm-lpss.h new file mode 100644 index 000000000000..296bd837ddbb --- /dev/null +++ b/include/linux/platform_data/x86/pwm-lpss.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Intel Low Power Subsystem PWM controller driver */ + +#ifndef __PLATFORM_DATA_X86_PWM_LPSS_H +#define __PLATFORM_DATA_X86_PWM_LPSS_H + +#include + +struct device; + +struct pwm_lpss_chip; + +struct pwm_lpss_boardinfo { + unsigned long clk_rate; + unsigned int npwm; + unsigned long base_unit_bits; + /* + * Some versions of the IP may stuck in the state machine if enable + * bit is not set, and hence update bit will show busy status till + * the reset. For the rest it may be otherwise. + */ + bool bypass; + /* + * On some devices the _PS0/_PS3 AML code of the GPU (GFX0) device + * messes with the PWM0 controllers state, + */ + bool other_devices_aml_touches_pwm_regs; +}; + +struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, void __iomem *base, + const struct pwm_lpss_boardinfo *info); + +#endif /* __PLATFORM_DATA_X86_PWM_LPSS_H */ -- cgit From f0f31de35644537a18059dac0e4a06f8b1c1c6c0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 17 Nov 2022 13:08:04 +0200 Subject: pwm: lpss: Rename pwm_lpss_probe() --> devm_pwm_lpss_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pwm_lpss_probe() uses managed resources. Show this to the users explicitly by adding devm prefix to its name. Signed-off-by: Andy Shevchenko Acked-by: Uwe Kleine-König --- include/linux/platform_data/x86/pwm-lpss.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pwm-lpss.h b/include/linux/platform_data/x86/pwm-lpss.h index 296bd837ddbb..c852fe24fe2a 100644 --- a/include/linux/platform_data/x86/pwm-lpss.h +++ b/include/linux/platform_data/x86/pwm-lpss.h @@ -27,7 +27,7 @@ struct pwm_lpss_boardinfo { bool other_devices_aml_touches_pwm_regs; }; -struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, void __iomem *base, - const struct pwm_lpss_boardinfo *info); +struct pwm_lpss_chip *devm_pwm_lpss_probe(struct device *dev, void __iomem *base, + const struct pwm_lpss_boardinfo *info); #endif /* __PLATFORM_DATA_X86_PWM_LPSS_H */ -- cgit From 02a476d932287cf3096f78962ccb70d94d6203c6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 21 Nov 2022 10:46:45 +0100 Subject: kobject: make kobject_get_ownership() take a constant kobject * The call, kobject_get_ownership(), does not modify the kobject passed into it, so make it const. This propagates down into the kobj_type function callbacks so make the kobject passed into them also const, ensuring that nothing in the kobject is being changed here. This helps make it more obvious what calls and callbacks do, and do not, modify structures passed to them. Cc: Trond Myklebust Cc: Anna Schumaker Cc: Roopa Prabhu Cc: "David S. Miller" Cc: Eric Dumazet Cc: Paolo Abeni Cc: Chuck Lever Cc: Jeff Layton Cc: linux-nfs@vger.kernel.org Cc: bridge@lists.linux-foundation.org Cc: netdev@vger.kernel.org Acked-by: Jakub Kicinski Acked-by: Rafael J. Wysocki Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20221121094649.1556002-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index fc40fc81aeb1..d978dbceb50d 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -113,7 +113,7 @@ extern struct kobject * __must_check kobject_get_unless_zero( extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); -extern void kobject_get_ownership(struct kobject *kobj, +extern void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); extern char *kobject_get_path(const struct kobject *kobj, gfp_t flag); @@ -121,9 +121,9 @@ struct kobj_type { void (*release)(struct kobject *kobj); const struct sysfs_ops *sysfs_ops; const struct attribute_group **default_groups; - const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj); - const void *(*namespace)(struct kobject *kobj); - void (*get_ownership)(struct kobject *kobj, kuid_t *uid, kgid_t *gid); + const struct kobj_ns_type_operations *(*child_ns_type)(const struct kobject *kobj); + const void *(*namespace)(const struct kobject *kobj); + void (*get_ownership)(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); }; struct kobj_uevent_env { -- cgit From 542aa24646ca20ccedb70829a95254ce602cdcbd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 21 Nov 2022 10:46:46 +0100 Subject: kobject: make kobject_namespace take a const * kobject_namespace() should take a const *kobject as it does not modify the kobject passed to it. Change that, and the functions kobj_child_ns_ops() and kobj_ns_ops() needed to also be changed to const *. Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20221121094649.1556002-2-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 2 +- include/linux/kobject_ns.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index d978dbceb50d..5a2d58e10bf5 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -112,7 +112,7 @@ extern struct kobject * __must_check kobject_get_unless_zero( struct kobject *kobj); extern void kobject_put(struct kobject *kobj); -extern const void *kobject_namespace(struct kobject *kobj); +extern const void *kobject_namespace(const struct kobject *kobj); extern void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); extern char *kobject_get_path(const struct kobject *kobj, gfp_t flag); diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index 2b5b64256cf4..be707748e7ce 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -47,8 +47,8 @@ struct kobj_ns_type_operations { int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); int kobj_ns_type_registered(enum kobj_ns_type type); -const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); -const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); +const struct kobj_ns_type_operations *kobj_child_ns_ops(const struct kobject *parent); +const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj); bool kobj_ns_current_may_mount(enum kobj_ns_type type); void *kobj_ns_grab_current(enum kobj_ns_type type); -- cgit From c45a88bb3f6cdaeb29d8ee98463610ad815721ab Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 21 Nov 2022 10:46:47 +0100 Subject: kobject: kset_uevent_ops: make filter() callback take a const * MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The filter() callback in struct kset_uevent_ops does not modify the kobject passed into it, so make the pointer const to enforce this restriction. When doing so, fix up all existing filter() callbacks to have the correct signature to preserve the build. Cc: Sumit Semwal Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Acked-by: Rafael J. Wysocki Acked-by: Christian König for the changes to Link: https://lore.kernel.org/r/20221121094649.1556002-3-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 5a2d58e10bf5..640f59d4b3de 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -135,7 +135,7 @@ struct kobj_uevent_env { }; struct kset_uevent_ops { - int (* const filter)(struct kobject *kobj); + int (* const filter)(const struct kobject *kobj); const char *(* const name)(struct kobject *kobj); int (* const uevent)(struct kobject *kobj, struct kobj_uevent_env *env); }; -- cgit From a53d1acc978321734a8fd7388f2c050a7219ab69 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 21 Nov 2022 10:46:48 +0100 Subject: kobject: kset_uevent_ops: make name() callback take a const * The name() callback in struct kset_uevent_ops does not modify the kobject passed into it, so make the pointer const to enforce this restriction. When doing so, fix up the single existing name() callback to have the correct signature to preserve the build. Acked-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20221121094649.1556002-4-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 640f59d4b3de..58a5b75612e3 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -136,7 +136,7 @@ struct kobj_uevent_env { struct kset_uevent_ops { int (* const filter)(const struct kobject *kobj); - const char *(* const name)(struct kobject *kobj); + const char *(* const name)(const struct kobject *kobj); int (* const uevent)(struct kobject *kobj, struct kobj_uevent_env *env); }; -- cgit From 9f041c5d8296b3a04cf3ead473a124fb538490dc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 21 Nov 2022 10:46:49 +0100 Subject: driver core: pass a const * into of_device_uevent() of_device_uevent() does not modify the struct device * passed into it, so make it a const * to enforce this. Also the documentation for the function was really wrong so fix that up at the same time. Cc: Rob Herring Cc: Frank Rowand Cc: devicetree@vger.kernel.org Link: https://lore.kernel.org/r/20221121094649.1556002-5-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/of_device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 1a803e4335d3..ab7d557d541d 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -35,7 +35,7 @@ extern const void *of_device_get_match_data(const struct device *dev); extern ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len); extern int of_device_request_module(struct device *dev); -extern void of_device_uevent(struct device *dev, struct kobj_uevent_env *env); +extern void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env); extern int of_device_uevent_modalias(struct device *dev, struct kobj_uevent_env *env); static inline struct device_node *of_cpu_device_node_get(int cpu) @@ -64,7 +64,7 @@ static inline int of_driver_match_device(struct device *dev, return 0; } -static inline void of_device_uevent(struct device *dev, +static inline void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { } static inline const void *of_device_get_match_data(const struct device *dev) -- cgit From 96e8298945010d4b0a0c21841566401848a42afc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 14 Nov 2022 17:18:13 +0200 Subject: serdev: Replace poll loop by readx_poll_timeout() macro The readx_poll_timeout() consolidates the necessary code under macro. Replace current code with it. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20221114151813.37294-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 66f624fc618c..5f6bfe4f6d95 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -279,18 +280,10 @@ static inline bool serdev_device_get_cts(struct serdev_device *serdev) static inline int serdev_device_wait_for_cts(struct serdev_device *serdev, bool state, int timeout_ms) { - unsigned long timeout; bool signal; - timeout = jiffies + msecs_to_jiffies(timeout_ms); - while (time_is_after_jiffies(timeout)) { - signal = serdev_device_get_cts(serdev); - if (signal == state) - return 0; - usleep_range(1000, 2000); - } - - return -ETIMEDOUT; + return readx_poll_timeout(serdev_device_get_cts, serdev, signal, signal == state, + 2000, timeout_ms * 1000); } static inline int serdev_device_set_rts(struct serdev_device *serdev, bool enable) -- cgit From 9c7babf94a0d686b552e53aded8d4703d1b8b92b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 23 Nov 2022 12:44:38 +1100 Subject: xfs,iomap: move delalloc punching to iomap Because that's what Christoph wants for this error handling path only XFS uses. It requires a new iomap export for handling errors over delalloc ranges. This is basically the XFS code as is stands, but even though Christoph wants this as iomap funcitonality, we still have to call it from the filesystem specific ->iomap_end callback, and call into the iomap code with yet another filesystem specific callback to punch the delalloc extent within the defined ranges. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- include/linux/iomap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 238a03087e17..0698c4b8ce0e 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -226,6 +226,10 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); +int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)); + int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); -- cgit From 2cd10a496a86787367716b684dadfecbb594095b Mon Sep 17 00:00:00 2001 From: Joel Colledge Date: Tue, 22 Nov 2022 14:43:00 +0100 Subject: lru_cache: remove unused lc_private, lc_set, lc_index_of MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Joel Colledge Signed-off-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20221122134301.69258-4-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- include/linux/lru_cache.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 07add7882a5d..c9afcdd9324c 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -199,7 +199,6 @@ struct lru_cache { unsigned long flags; - void *lc_private; const char *name; /* nr_elements there */ @@ -241,7 +240,6 @@ extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, unsigned e_count, size_t e_size, size_t e_off); extern void lc_reset(struct lru_cache *lc); extern void lc_destroy(struct lru_cache *lc); -extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); extern void lc_del(struct lru_cache *lc, struct lc_element *element); extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr); @@ -297,6 +295,5 @@ extern bool lc_is_used(struct lru_cache *lc, unsigned int enr); container_of(ptr, type, member) extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i); -extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e); #endif -- cgit From 50c697215a8cc22f0e58c88f06f2716c05a26e85 Mon Sep 17 00:00:00 2001 From: Sam James Date: Wed, 16 Nov 2022 18:26:34 +0000 Subject: kbuild: fix -Wimplicit-function-declaration in license_is_gpl_compatible Add missing include for strcmp. Clang 16 makes -Wimplicit-function-declaration an error by default. Unfortunately, out of tree modules may use this in configure scripts, which means failure might cause silent miscompilation or misconfiguration. For more information, see LWN.net [0] or LLVM's Discourse [1], gentoo-dev@ [2], or the (new) c-std-porting mailing list [3]. [0] https://lwn.net/Articles/913505/ [1] https://discourse.llvm.org/t/configure-script-breakage-with-the-new-werror-implicit-function-declaration/65213 [2] https://archives.gentoo.org/gentoo-dev/message/dd9f2d3082b8b6f8dfbccb0639e6e240 [3] hosted at lists.linux.dev. [akpm@linux-foundation.org: remember "linux/"] Link: https://lkml.kernel.org/r/20221116182634.2823136-1-sam@gentoo.org Signed-off-by: Sam James Cc: Signed-off-by: Andrew Morton --- include/linux/license.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/license.h b/include/linux/license.h index 7cce390f120b..ad937f57f2cb 100644 --- a/include/linux/license.h +++ b/include/linux/license.h @@ -2,6 +2,8 @@ #ifndef __LICENSE_H #define __LICENSE_H +#include + static inline int license_is_gpl_compatible(const char *license) { return (strcmp(license, "GPL") == 0 -- cgit From ea4452de2ae987342fadbdd2c044034e6480daad Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 18 Nov 2022 18:00:11 +0800 Subject: mm: fix unexpected changes to {failslab|fail_page_alloc}.attr When we specify __GFP_NOWARN, we only expect that no warnings will be issued for current caller. But in the __should_failslab() and __should_fail_alloc_page(), the local GFP flags alter the global {failslab|fail_page_alloc}.attr, which is persistent and shared by all tasks. This is not what we expected, let's fix it. [akpm@linux-foundation.org: unexport should_fail_ex()] Link: https://lkml.kernel.org/r/20221118100011.2634-1-zhengqi.arch@bytedance.com Fixes: 3f913fc5f974 ("mm: fix missing handler for __GFP_NOWARN") Signed-off-by: Qi Zheng Reported-by: Dmitry Vyukov Reviewed-by: Akinobu Mita Reviewed-by: Jason Gunthorpe Cc: Akinobu Mita Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 9f6e25467844..444236dadcf0 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -20,7 +20,6 @@ struct fault_attr { atomic_t space; unsigned long verbose; bool task_filter; - bool no_warn; unsigned long stacktrace_depth; unsigned long require_start; unsigned long require_end; @@ -32,6 +31,10 @@ struct fault_attr { struct dentry *dname; }; +enum fault_flags { + FAULT_NOWARN = 1 << 0, +}; + #define FAULT_ATTR_INITIALIZER { \ .interval = 1, \ .times = ATOMIC_INIT(1), \ @@ -40,11 +43,11 @@ struct fault_attr { .ratelimit_state = RATELIMIT_STATE_INIT_DISABLED, \ .verbose = 2, \ .dname = NULL, \ - .no_warn = false, \ } #define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER int setup_fault_attr(struct fault_attr *attr, char *str); +bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags); bool should_fail(struct fault_attr *attr, ssize_t size); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS -- cgit From 19d05ea712ecbbb67d302664da5ec58b37b9aece Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:54 +0200 Subject: net: dsa: move tag_8021q headers to their proper place tag_8021q definitions are all over the place. Some are exported to linux/dsa/8021q.h (visible by DSA core, taggers, switch drivers and everyone else), and some are in dsa_priv.h. Move the structures that don't need external visibility into tag_8021q.c, and the ones which don't need the world or switch drivers to see them into tag_8021q.h. We also have the tag_8021q.h inclusion from switch.c, which is basically the entire reason why tag_8021q.c was built into DSA in commit 8b6e638b4be2 ("net: dsa: build tag_8021q.c as part of DSA core"). I still don't know how to better deal with that, so leave it alone. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 3ed117e299ec..f3664ee12170 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -5,28 +5,8 @@ #ifndef _NET_DSA_8021Q_H #define _NET_DSA_8021Q_H -#include -#include #include - -struct dsa_switch; -struct dsa_port; -struct sk_buff; -struct net_device; - -struct dsa_tag_8021q_vlan { - struct list_head list; - int port; - u16 vid; - refcount_t refcount; -}; - -struct dsa_8021q_context { - struct dsa_switch *ds; - struct list_head vlans; - /* EtherType of RX VID, used for filtering on master interface */ - __be16 proto; -}; +#include int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); @@ -38,15 +18,6 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge); -struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, - u16 tpid, u16 tci); - -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, - int *vbid); - -struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, - int vbid); - u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num); u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp); -- cgit From 0c0860d190ca430fea27045f714d3bf3b3810a4d Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 17 Nov 2022 12:47:29 +0100 Subject: fbdev: Make fb_modesetting_disabled() static inline Make fb_modesetting_disabled() a static-inline function when it is defined in the header file. Avoid the linker error shown below. ld: drivers/video/fbdev/core/fbmon.o: in function `fb_modesetting_disabled': fbmon.c:(.text+0x1e4): multiple definition of `fb_modesetting_disabled'; drivers/video/fbdev/core/fbmem.o:fbmem.c:(.text+0x1bac): first defined here A bug report is at [1]. Reported-by: Stephen Rothwell Reported-by: kernel test robot Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Fixes: 0ba2fa8cbd29 ("fbdev: Add support for the nomodeset kernel parameter") Cc: Javier Martinez Canillas Cc: Daniel Vetter Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Link: https://lore.kernel.org/dri-devel/20221117183214.2473e745@canb.auug.org.au/T/#u # 1 Link: https://patchwork.freedesktop.org/patch/msgid/20221117114729.7570-1-tzimmermann@suse.de (cherry picked from commit a189b2ee938f6b15ad9f95bdef63f95a3a092418) --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 8dc9635f5bc1..a2a52f730989 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -807,7 +807,7 @@ extern int fb_find_mode(struct fb_var_screeninfo *var, #if defined(CONFIG_VIDEO_NOMODESET) bool fb_modesetting_disabled(const char *drvname); #else -bool fb_modesetting_disabled(const char *drvname) +static inline bool fb_modesetting_disabled(const char *drvname) { return false; } -- cgit From 1ab4de28cb1074dabf684076996d570afbe1dc4e Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 18 Nov 2022 14:35:34 +0100 Subject: Revert "drm/fb-helper: Schedule deferred-I/O worker after writing to framebuffer" This reverts commit 7f5cc4a3e5e4c5a38e5748defc952e45278f7a70. Needed to restore the fbdev damage worker. There have been bug reports about locking order [1] and incorrectly takens branches. [2] Restore the damage worker until these problems have been resovled. Signed-off-by: Thomas Zimmermann Acked-by: Daniel Vetter Link: https://intel-gfx-ci.01.org/tree/drm-tip/fi-kbl-8809g.html # 1 Link: https://lore.kernel.org/dri-devel/20221115115819.23088-6-tzimmermann@suse.de/T/#m06eedc0a468940e4cbbd14ca026733b639bc445a # 2 Link: https://patchwork.freedesktop.org/patch/msgid/20221118133535.9739-3-tzimmermann@suse.de (cherry picked from commit 8b83e1a455382dc667898a525a93f4eb6716cc41) --- include/linux/fb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index a2a52f730989..64889a2b783f 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -663,7 +663,6 @@ extern void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file); extern void fb_deferred_io_cleanup(struct fb_info *info); -extern void fb_deferred_io_schedule_flush(struct fb_info *info); extern int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasync); -- cgit From b3ad31f33982497dbc7a66a9d3013b1ac6985dfe Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 11 Nov 2022 09:31:28 +0800 Subject: soundwire: intel: start using hw_ops Before introducing new hardware with completely different register spaces and programming sequences, we need to abstract some of the existing routines in hw_ops that will be platform-specific. For now we only use the 'cnl' ops - after the first Intel platform with SoundWire capabilities. Rather than one big intrusive patch, hw_ops are introduced in this patch so show the dependencies between drivers. Follow-up patches will introduce callbacks for debugfs, power and bus management. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20221111013135.38289-2-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 2e9fd91572d4..2dbe34b41ef1 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -233,6 +233,7 @@ struct sdw_intel_ctx { * struct sdw_intel_res - Soundwire Intel global resource structure, * typically populated by the DSP driver * + * @hw_ops: abstraction for platform ops * @count: link count * @mmio_base: mmio base of SoundWire registers * @irq: interrupt number @@ -249,6 +250,7 @@ struct sdw_intel_ctx { * @alh_base: sdw alh base. */ struct sdw_intel_res { + const struct sdw_intel_hw_ops *hw_ops; int count; void __iomem *mmio_base; int irq; @@ -292,4 +294,17 @@ irqreturn_t sdw_intel_thread(int irq, void *dev_id); #define SDW_INTEL_QUIRK_MASK_BUS_DISABLE BIT(1) +struct sdw_intel; + +/* struct intel_sdw_hw_ops - SoundWire ops for Intel platforms. + * @pre_bank_switch: helper for bus management + * @post_bank_switch: helper for bus management + */ +struct sdw_intel_hw_ops { + int (*pre_bank_switch)(struct sdw_intel *sdw); + int (*post_bank_switch)(struct sdw_intel *sdw); +}; + +extern const struct sdw_intel_hw_ops sdw_intel_cnl_hw_ops; + #endif -- cgit From fb2dc6a0a5f885233d632b1e92be9c0be977b0dc Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 11 Nov 2022 09:31:29 +0800 Subject: soundwire: intel: add debugfs callbacks in hw_ops No functionality change, only add indirection for debugfs helpers. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20221111013135.38289-3-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 2dbe34b41ef1..211924e4ebf2 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -297,10 +297,15 @@ irqreturn_t sdw_intel_thread(int irq, void *dev_id); struct sdw_intel; /* struct intel_sdw_hw_ops - SoundWire ops for Intel platforms. + * @debugfs_init: initialize all debugfs capabilities + * @debugfs_exit: close and cleanup debugfs capabilities * @pre_bank_switch: helper for bus management * @post_bank_switch: helper for bus management */ struct sdw_intel_hw_ops { + void (*debugfs_init)(struct sdw_intel *sdw); + void (*debugfs_exit)(struct sdw_intel *sdw); + int (*pre_bank_switch)(struct sdw_intel *sdw); int (*post_bank_switch)(struct sdw_intel *sdw); }; -- cgit From b6234bcc6589a0719ec91d810114c0b556a5b88b Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 11 Nov 2022 09:31:30 +0800 Subject: soundwire: intel: add register_dai callback in hw_ops No functionality change, only add indirection for DAI registration helper. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20221111013135.38289-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 211924e4ebf2..5be63d4fe62e 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -299,6 +299,7 @@ struct sdw_intel; /* struct intel_sdw_hw_ops - SoundWire ops for Intel platforms. * @debugfs_init: initialize all debugfs capabilities * @debugfs_exit: close and cleanup debugfs capabilities + * @register_dai: read all PDI information and register DAIs * @pre_bank_switch: helper for bus management * @post_bank_switch: helper for bus management */ @@ -306,6 +307,8 @@ struct sdw_intel_hw_ops { void (*debugfs_init)(struct sdw_intel *sdw); void (*debugfs_exit)(struct sdw_intel *sdw); + int (*register_dai)(struct sdw_intel *sdw); + int (*pre_bank_switch)(struct sdw_intel *sdw); int (*post_bank_switch)(struct sdw_intel *sdw); }; -- cgit From 3db0c5a6a2832c7b4b40676299e4bbbe1a96bc8b Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 11 Nov 2022 09:31:31 +0800 Subject: soundwire: intel: add bus management callbacks in hw_ops No functionality change, only add indirection for bus management helpers. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20221111013135.38289-5-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 5be63d4fe62e..cee61bc9af8a 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -300,6 +300,11 @@ struct sdw_intel; * @debugfs_init: initialize all debugfs capabilities * @debugfs_exit: close and cleanup debugfs capabilities * @register_dai: read all PDI information and register DAIs + * @check_clock_stop: throw error message if clock is not stopped. + * @start_bus: normal start + * @start_bus_after_reset: start after reset + * @start_bus_after_clock_stop: start after mode0 clock stop + * @stop_bus: stop all bus * @pre_bank_switch: helper for bus management * @post_bank_switch: helper for bus management */ @@ -309,6 +314,12 @@ struct sdw_intel_hw_ops { int (*register_dai)(struct sdw_intel *sdw); + void (*check_clock_stop)(struct sdw_intel *sdw); + int (*start_bus)(struct sdw_intel *sdw); + int (*start_bus_after_reset)(struct sdw_intel *sdw); + int (*start_bus_after_clock_stop)(struct sdw_intel *sdw); + int (*stop_bus)(struct sdw_intel *sdw, bool clock_stop); + int (*pre_bank_switch)(struct sdw_intel *sdw); int (*post_bank_switch)(struct sdw_intel *sdw); }; -- cgit From 49c9ff45991a5a62e040c8b43c89a9ab38a0a91f Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 11 Nov 2022 09:31:32 +0800 Subject: soundwire: intel: add link power management callbacks in hw_ops No functionality change, only add indirection for link power management helpers. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20221111013135.38289-6-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index cee61bc9af8a..81430201b8b9 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -305,6 +305,8 @@ struct sdw_intel; * @start_bus_after_reset: start after reset * @start_bus_after_clock_stop: start after mode0 clock stop * @stop_bus: stop all bus + * @link_power_up: power-up using chip-specific helpers + * @link_power_down: power-down with chip-specific helpers * @pre_bank_switch: helper for bus management * @post_bank_switch: helper for bus management */ @@ -320,6 +322,9 @@ struct sdw_intel_hw_ops { int (*start_bus_after_clock_stop)(struct sdw_intel *sdw); int (*stop_bus)(struct sdw_intel *sdw, bool clock_stop); + int (*link_power_up)(struct sdw_intel *sdw); + int (*link_power_down)(struct sdw_intel *sdw); + int (*pre_bank_switch)(struct sdw_intel *sdw); int (*post_bank_switch)(struct sdw_intel *sdw); }; -- cgit From 36e3b385f35a33a10b792ec46350dd87d79e84dd Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 11 Nov 2022 09:31:33 +0800 Subject: soundwire: intel: add in-band wake callbacks in hw_ops No functionality change, only add indirection for in-band wake management helpers. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20221111013135.38289-7-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 81430201b8b9..0942cd464095 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -307,6 +307,8 @@ struct sdw_intel; * @stop_bus: stop all bus * @link_power_up: power-up using chip-specific helpers * @link_power_down: power-down with chip-specific helpers + * @shim_check_wake: check if a wake was received + * @shim_wake: enable/disable in-band wake management * @pre_bank_switch: helper for bus management * @post_bank_switch: helper for bus management */ @@ -325,6 +327,9 @@ struct sdw_intel_hw_ops { int (*link_power_up)(struct sdw_intel *sdw); int (*link_power_down)(struct sdw_intel *sdw); + int (*shim_check_wake)(struct sdw_intel *sdw); + void (*shim_wake)(struct sdw_intel *sdw, bool wake_enable); + int (*pre_bank_switch)(struct sdw_intel *sdw); int (*post_bank_switch)(struct sdw_intel *sdw); }; -- cgit From 562bb228cebea475cc967c4a53df97ca62aa90b5 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 11 Nov 2022 12:26:51 +0800 Subject: soundwire: intel_init: remove sdw_intel_enable_irq() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functionality is implemented with per-chip callbacks, there are no users of this symbol, remove the code. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Reviewed-by: Ranjani Sridharan Signed-off-by: Bard Liao Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20221111042653.45520-7-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_intel.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 2e9fd91572d4..d2f581feed67 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -286,8 +286,6 @@ int sdw_intel_startup(struct sdw_intel_ctx *ctx); void sdw_intel_exit(struct sdw_intel_ctx *ctx); -void sdw_intel_enable_irq(void __iomem *mmio_base, bool enable); - irqreturn_t sdw_intel_thread(int irq, void *dev_id); #define SDW_INTEL_QUIRK_MASK_BUS_DISABLE BIT(1) -- cgit From 9f0933ac026f7e54fe096797af9de20724e79097 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 21 Nov 2022 16:31:34 +0000 Subject: fscache: fix OOB Read in __fscache_acquire_volume The type of a->key[0] is char in fscache_volume_same(). If the length of cache volume key is greater than 127, the value of a->key[0] is less than 0. In this case, klen becomes much larger than 255 after type conversion, because the type of klen is size_t. As a result, memcmp() is read out of bounds. This causes a slab-out-of-bounds Read in __fscache_acquire_volume(), as reported by Syzbot. Fix this by changing the type of the stored key to "u8 *" rather than "char *" (it isn't a simple string anyway). Also put in a check that the volume name doesn't exceed NAME_MAX. BUG: KASAN: slab-out-of-bounds in memcmp+0x16f/0x1c0 lib/string.c:757 Read of size 8 at addr ffff888016f3aa90 by task syz-executor344/3613 Call Trace: memcmp+0x16f/0x1c0 lib/string.c:757 memcmp include/linux/fortify-string.h:420 [inline] fscache_volume_same fs/fscache/volume.c:133 [inline] fscache_hash_volume fs/fscache/volume.c:171 [inline] __fscache_acquire_volume+0x76c/0x1080 fs/fscache/volume.c:328 fscache_acquire_volume include/linux/fscache.h:204 [inline] v9fs_cache_session_get_cookie+0x143/0x240 fs/9p/cache.c:34 v9fs_session_init+0x1166/0x1810 fs/9p/v9fs.c:473 v9fs_mount+0xba/0xc90 fs/9p/vfs_super.c:126 legacy_get_tree+0x105/0x220 fs/fs_context.c:610 vfs_get_tree+0x89/0x2f0 fs/super.c:1530 do_new_mount fs/namespace.c:3040 [inline] path_mount+0x1326/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 Fixes: 62ab63352350 ("fscache: Implement volume registration") Reported-by: syzbot+a76f6a6e524cf2080aa3@syzkaller.appspotmail.com Signed-off-by: David Howells Reviewed-by: Zhang Peng Reviewed-by: Jingbo Xu cc: Dominique Martinet cc: Jeff Layton cc: v9fs-developer@lists.sourceforge.net cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/Y3OH+Dmi0QIOK18n@codewreck.org/ # Zhang Peng's v1 fix Link: https://lore.kernel.org/r/20221115140447.2971680-1-zhangpeng362@huawei.com/ # Zhang Peng's v2 fix Link: https://lore.kernel.org/r/166869954095.3793579.8500020902371015443.stgit@warthog.procyon.org.uk/ # v1 Signed-off-by: Linus Torvalds --- include/linux/fscache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 36e5dd84cf59..8e312c8323a8 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -75,7 +75,7 @@ struct fscache_volume { atomic_t n_accesses; /* Number of cache accesses in progress */ unsigned int debug_id; unsigned int key_hash; /* Hash of key string */ - char *key; /* Volume ID, eg. "afs@example.com@1234" */ + u8 *key; /* Volume ID, eg. "afs@example.com@1234" */ struct list_head proc_link; /* Link in /proc/fs/fscache/volumes */ struct hlist_bl_node hash_link; /* Link in hash table */ struct work_struct work; -- cgit From 3223417d1fc44d0a487ad8147b4548f83af24153 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 16 Nov 2022 07:49:28 +0000 Subject: ACPI: FFH: Silence missing prototype warnings Silence the following warnings when built with W=1: | CC drivers/acpi/acpi_ffh.c | warning: no previous prototype for 'acpi_ffh_address_space_arch_setup' [-Wmissing-prototypes] | int __weak acpi_ffh_address_space_arch_setup(void *handler_ctxt, | ^ | CC drivers/acpi/acpi_ffh.c | warning: no previous prototype for 'acpi_ffh_address_space_arch_handler' [-Wmissing-prototypes] | int __weak acpi_ffh_address_space_arch_handler(acpi_integer *value, | ^ Reported-by: kernel test robot Signed-off-by: Sudeep Holla Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c026c1129cba..78d72730ec65 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1491,6 +1491,10 @@ static inline void acpi_init_pcc(void) { } #ifdef CONFIG_ACPI_FFH void acpi_init_ffh(void); +extern int acpi_ffh_address_space_arch_setup(void *handler_ctxt, + void **region_ctxt); +extern int acpi_ffh_address_space_arch_handler(acpi_integer *value, + void *region_context); #else static inline void acpi_init_ffh(void) { } #endif -- cgit From 40eb28dc17f87cfac69d7755447039e92ac5fbda Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 22 Nov 2022 15:35:57 +0200 Subject: device property: Get rid of __PROPERTY_ENTRY_ARRAY_EL*SIZE*() First of all, _ELEMENT_SIZE() repeats existing sizeof_field() macro. Second, usage of _ARRAY_ELSIZE_LEN() adds unnecessary indirection to the data layout. It's more understandable when the data structure is placed explicitly. That said, get rid of those macros by replacing them with the existing helper and explicit data structure layout. Signed-off-by: Andy Shevchenko Acked-by: Heikki Krogerus Link: https://lore.kernel.org/r/20221122133600.49897-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 83674f968a8f..04a6b1433cd9 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -12,6 +12,7 @@ #include #include +#include #include struct device; @@ -303,24 +304,14 @@ struct property_entry { * crafted to avoid gcc-4.4.4's problems with initialization of anon unions * and structs. */ - -#define __PROPERTY_ENTRY_ELEMENT_SIZE(_elem_) \ - sizeof(((struct property_entry *)NULL)->value._elem_[0]) - -#define __PROPERTY_ENTRY_ARRAY_ELSIZE_LEN(_name_, _elsize_, _Type_, \ - _val_, _len_) \ -(struct property_entry) { \ - .name = _name_, \ - .length = (_len_) * (_elsize_), \ - .type = DEV_PROP_##_Type_, \ - { .pointer = _val_ }, \ +#define __PROPERTY_ENTRY_ARRAY_LEN(_name_, _elem_, _Type_, _val_, _len_) \ +(struct property_entry) { \ + .name = _name_, \ + .length = (_len_) * sizeof_field(struct property_entry, value._elem_[0]), \ + .type = DEV_PROP_##_Type_, \ + { .pointer = _val_ }, \ } -#define __PROPERTY_ENTRY_ARRAY_LEN(_name_, _elem_, _Type_, _val_, _len_)\ - __PROPERTY_ENTRY_ARRAY_ELSIZE_LEN(_name_, \ - __PROPERTY_ENTRY_ELEMENT_SIZE(_elem_), \ - _Type_, _val_, _len_) - #define PROPERTY_ENTRY_U8_ARRAY_LEN(_name_, _val_, _len_) \ __PROPERTY_ENTRY_ARRAY_LEN(_name_, u8_data, U8, _val_, _len_) #define PROPERTY_ENTRY_U16_ARRAY_LEN(_name_, _val_, _len_) \ @@ -332,9 +323,12 @@ struct property_entry { #define PROPERTY_ENTRY_STRING_ARRAY_LEN(_name_, _val_, _len_) \ __PROPERTY_ENTRY_ARRAY_LEN(_name_, str, STRING, _val_, _len_) #define PROPERTY_ENTRY_REF_ARRAY_LEN(_name_, _val_, _len_) \ - __PROPERTY_ENTRY_ARRAY_ELSIZE_LEN(_name_, \ - sizeof(struct software_node_ref_args), \ - REF, _val_, _len_) +(struct property_entry) { \ + .name = _name_, \ + .length = (_len_) * sizeof(struct software_node_ref_args), \ + .type = DEV_PROP_REF, \ + { .pointer = _val_ }, \ +} #define PROPERTY_ENTRY_U8_ARRAY(_name_, _val_) \ PROPERTY_ENTRY_U8_ARRAY_LEN(_name_, _val_, ARRAY_SIZE(_val_)) @@ -352,7 +346,7 @@ struct property_entry { #define __PROPERTY_ENTRY_ELEMENT(_name_, _elem_, _Type_, _val_) \ (struct property_entry) { \ .name = _name_, \ - .length = __PROPERTY_ENTRY_ELEMENT_SIZE(_elem_), \ + .length = sizeof_field(struct property_entry, value._elem_[0]), \ .is_inline = true, \ .type = DEV_PROP_##_Type_, \ { .value = { ._elem_[0] = _val_ } }, \ -- cgit From c6c76563bd13871739539e20fd3116159e491f5b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 22 Nov 2022 15:35:58 +0200 Subject: device property: Move PROPERTY_ENTRY_BOOL() a bit down Let's order ARRAY and non-ARRAY macros in the same way. The PROPERTY_ENTRY_BOOL() is special, move it a bit down in the code so it won't break ordering of the rest. Signed-off-by: Andy Shevchenko Acked-by: Heikki Krogerus Link: https://lore.kernel.org/r/20221122133600.49897-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 04a6b1433cd9..a1f846a15113 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -322,6 +322,7 @@ struct property_entry { __PROPERTY_ENTRY_ARRAY_LEN(_name_, u64_data, U64, _val_, _len_) #define PROPERTY_ENTRY_STRING_ARRAY_LEN(_name_, _val_, _len_) \ __PROPERTY_ENTRY_ARRAY_LEN(_name_, str, STRING, _val_, _len_) + #define PROPERTY_ENTRY_REF_ARRAY_LEN(_name_, _val_, _len_) \ (struct property_entry) { \ .name = _name_, \ @@ -340,7 +341,7 @@ struct property_entry { PROPERTY_ENTRY_U64_ARRAY_LEN(_name_, _val_, ARRAY_SIZE(_val_)) #define PROPERTY_ENTRY_STRING_ARRAY(_name_, _val_) \ PROPERTY_ENTRY_STRING_ARRAY_LEN(_name_, _val_, ARRAY_SIZE(_val_)) -#define PROPERTY_ENTRY_REF_ARRAY(_name_, _val_) \ +#define PROPERTY_ENTRY_REF_ARRAY(_name_, _val_) \ PROPERTY_ENTRY_REF_ARRAY_LEN(_name_, _val_, ARRAY_SIZE(_val_)) #define __PROPERTY_ENTRY_ELEMENT(_name_, _elem_, _Type_, _val_) \ @@ -363,12 +364,6 @@ struct property_entry { #define PROPERTY_ENTRY_STRING(_name_, _val_) \ __PROPERTY_ENTRY_ELEMENT(_name_, str, STRING, _val_) -#define PROPERTY_ENTRY_BOOL(_name_) \ -(struct property_entry) { \ - .name = _name_, \ - .is_inline = true, \ -} - #define PROPERTY_ENTRY_REF(_name_, _ref_, ...) \ (struct property_entry) { \ .name = _name_, \ @@ -377,9 +372,14 @@ struct property_entry { { .pointer = &SOFTWARE_NODE_REFERENCE(_ref_, ##__VA_ARGS__), }, \ } +#define PROPERTY_ENTRY_BOOL(_name_) \ +(struct property_entry) { \ + .name = _name_, \ + .is_inline = true, \ +} + struct property_entry * property_entries_dup(const struct property_entry *properties); - void property_entries_free(const struct property_entry *properties); bool device_dma_supported(const struct device *dev); -- cgit From d96d30d836380e3b4cb64625d4a1ebe79b796ec5 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 16 Nov 2022 07:49:29 +0000 Subject: ACPI: PM: Silence missing prototype warning Silence the following warning when built with W=1: | CC drivers/acpi/device_pm.c | warning: no previous prototype for function 'acpi_subsys_restore_early' [-Wmissing-prototypes] | int acpi_subsys_restore_early(struct device *dev) | ^ Reported-by: kernel test robot Signed-off-by: Sudeep Holla Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3015235d65e3..ab1aca74ec21 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1136,6 +1136,7 @@ int acpi_subsys_freeze(struct device *dev); int acpi_subsys_poweroff(struct device *dev); void acpi_ec_mark_gpe_for_wake(void); void acpi_ec_set_gpe_wake_mask(u8 action); +int acpi_subsys_restore_early(struct device *dev); #else static inline int acpi_subsys_prepare(struct device *dev) { return 0; } static inline void acpi_subsys_complete(struct device *dev) {} @@ -1144,6 +1145,7 @@ static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; } static inline int acpi_subsys_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_freeze(struct device *dev) { return 0; } static inline int acpi_subsys_poweroff(struct device *dev) { return 0; } +static inline int acpi_subsys_restore_early(struct device *dev) { return 0; } static inline void acpi_ec_mark_gpe_for_wake(void) {} static inline void acpi_ec_set_gpe_wake_mask(u8 action) {} #endif -- cgit From 3cc36cabc669fffa7f04931e3dd25dc47314ec06 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Tue, 4 Oct 2022 15:49:05 +0200 Subject: iio: gyro: itg3200_core: do not use internal iio_dev lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The iio_device lock is only meant for internal use. Hence define a device local lock to protect against concurrent accesses. While at it, properly include "mutex.h" for mutex related APIs. Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20221004134909.1692021-13-nuno.sa@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/gyro/itg3200.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/gyro/itg3200.h b/include/linux/iio/gyro/itg3200.h index a602fe7b84fa..74b6d1cadc86 100644 --- a/include/linux/iio/gyro/itg3200.h +++ b/include/linux/iio/gyro/itg3200.h @@ -102,6 +102,8 @@ struct itg3200 { struct i2c_client *i2c; struct iio_trigger *trig; struct iio_mount_matrix orientation; + /* lock to protect against multiple access to the device */ + struct mutex lock; }; enum ITG3200_SCAN_INDEX { -- cgit From 0a8565425afd8ba0e1a0ea73e21da119ee6dacea Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Wed, 12 Oct 2022 17:16:17 +0200 Subject: iio: core: introduce iio_device_{claim|release}_buffer_mode() APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These APIs are analogous to iio_device_claim_direct_mode() and iio_device_release_direct_mode() but, as the name suggests, with the logic flipped. While this looks odd enough, it will have at least two users (in following changes) and it will be important to move the IIO mlock to the private struct. Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20221012151620.1725215-2-nuno.sa@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index f0ec8a5e5a7a..9d3bd6379eb8 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -629,6 +629,8 @@ int __devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev, int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp); int iio_device_claim_direct_mode(struct iio_dev *indio_dev); void iio_device_release_direct_mode(struct iio_dev *indio_dev); +int iio_device_claim_buffer_mode(struct iio_dev *indio_dev); +void iio_device_release_buffer_mode(struct iio_dev *indio_dev); extern struct bus_type iio_bus_type; -- cgit From 16afe125b53f88b855d2713c8ba253d905dcf3cc Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Wed, 12 Oct 2022 17:16:20 +0200 Subject: iio: core: move 'mlock' to 'struct iio_dev_opaque' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that there are no more users accessing 'mlock' directly, we can move it to the iio_dev private structure. Hence, it's now explicit that new driver's should not directly use this lock. Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20221012151620.1725215-5-nuno.sa@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-opaque.h | 2 ++ include/linux/iio/iio.h | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index d1f8b30a7c8b..5aec3945555b 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -11,6 +11,7 @@ * checked by device drivers but should be considered * read-only as this is a core internal bit * @driver_module: used to make it harder to undercut users + * @mlock: lock used to prevent simultaneous device state changes * @mlock_key: lockdep class for iio_dev lock * @info_exist_lock: lock to prevent use during removal * @trig_readonly: mark the current trigger immutable @@ -43,6 +44,7 @@ struct iio_dev_opaque { int currentmode; int id; struct module *driver_module; + struct mutex mlock; struct lock_class_key mlock_key; struct mutex info_exist_lock; bool trig_readonly; diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 9d3bd6379eb8..8e0afaaa3f75 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -548,8 +548,6 @@ struct iio_buffer_setup_ops { * and owner * @buffer: [DRIVER] any buffer present * @scan_bytes: [INTERN] num bytes captured to be fed to buffer demux - * @mlock: [INTERN] lock used to prevent simultaneous device state - * changes * @available_scan_masks: [DRIVER] optional array of allowed bitmasks * @masklength: [INTERN] the length of the mask established from * channels @@ -574,7 +572,6 @@ struct iio_dev { struct iio_buffer *buffer; int scan_bytes; - struct mutex mlock; const unsigned long *available_scan_masks; unsigned masklength; -- cgit From 9e855d77b1ec57704d23e25761a97e6e64abed66 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 16 Oct 2022 17:34:02 +0100 Subject: iio: st_sensors: core and lsm9ds0 switch to devm_regulator_bulk_get_enable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These drivers only turns the power on at probe and off via a custom devm_add_action_or_reset() callback. The two regulators were handled separately so also switch to bulk registration. The new devm_regulator_bulk_get_enable() replaces all this boilerplate code. Signed-off-by: Jonathan Cameron Cc: Linus Walleij Cc: Andy Shevchenko Reviewed-by: Matti Vaittinen Reviewed-by: Nuno Sá Link: https://lore.kernel.org/r/20221016163409.320197-8-jic23@kernel.org --- include/linux/iio/common/st_sensors.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index db4a1b260348..f5f3ee57bc70 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -224,8 +224,6 @@ struct st_sensor_settings { * @mount_matrix: The mounting matrix of the sensor. * @sensor_settings: Pointer to the specific sensor settings in use. * @current_fullscale: Maximum range of measure by the sensor. - * @vdd: Pointer to sensor's Vdd power supply - * @vdd_io: Pointer to sensor's Vdd-IO power supply * @regmap: Pointer to specific sensor regmap configuration. * @enabled: Status of the sensor (false->off, true->on). * @odr: Output data rate of the sensor [Hz]. @@ -244,8 +242,6 @@ struct st_sensor_data { struct iio_mount_matrix mount_matrix; struct st_sensor_settings *sensor_settings; struct st_sensor_fullscale_avl *current_fullscale; - struct regulator *vdd; - struct regulator *vdd_io; struct regmap *regmap; bool enabled; -- cgit From 2cc64a23c4e26107887af23a62f8ba3c79ff7ab5 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 3 Oct 2022 11:12:04 +0300 Subject: iio: Add IIO_STATIC_CONST_DEVICE_ATTR Add IIO_STATIC_CONST_DEVICE_ATTR macro for creating an read-only iio_dev_attr which returns constant value. This macro is intended to be used when replacing the IIO_CONST_ATTR - attributes for triggered buffers because the triggered buffer attributes must be of type iio_dev_attr. Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/8dd853dd0ef8eb40cb980cc6f6e7a43166de3afb.1664782676.git.mazziesaccount@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/sysfs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/sysfs.h b/include/linux/iio/sysfs.h index e51fba66de4b..de5bb125815c 100644 --- a/include/linux/iio/sysfs.h +++ b/include/linux/iio/sysfs.h @@ -97,6 +97,17 @@ struct iio_const_attr { = { .string = _string, \ .dev_attr = __ATTR(_name, S_IRUGO, iio_read_const_attr, NULL)} +#define IIO_STATIC_CONST_DEVICE_ATTR(_name, _string) \ + static ssize_t iio_const_dev_attr_show_##_name( \ + struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + return sysfs_emit(buf, "%s\n", _string); \ + } \ + static IIO_DEVICE_ATTR(_name, 0444, \ + iio_const_dev_attr_show_##_name, NULL, 0) + /* Generic attributes of onetype or another */ /** -- cgit From 0a33755c4b01ed62a6d025cb585928304f9653d7 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 3 Oct 2022 11:13:53 +0300 Subject: iio: Don't silently expect attribute types The iio_triggered_buffer_setup_ext() and the devm_iio_kfifo_buffer_setup_ext() were changed by commit 15097c7a1adc ("iio: buffer: wrap all buffer attributes into iio_dev_attr") to silently expect that all attributes given in buffer_attrs array are device-attributes. This expectation was not forced by the API - and some drivers did register attributes created by IIO_CONST_ATTR(). When using IIO_CONST_ATTRs the added attribute "wrapping" does not copy the pointer to stored string constant and when the sysfs file is read the kernel will access to invalid location. Change the function signatures to expect an array of iio_dev_attrs to avoid similar errors in the future. Merge conflict resolved whilst applying due to patch crossing with two new drivers (kx022a accelerometer and ad4130 ADC). Signed-off-by: Matti Vaittinen Tested-by: Claudiu Beznea Link: https://lore.kernel.org/r/63f54787a684eb1232f1c5d275a09c786987fe4a.1664782676.git.mazziesaccount@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer_impl.h | 2 +- include/linux/iio/kfifo_buf.h | 3 ++- include/linux/iio/triggered_buffer.h | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index e2ca8ea23e19..89c3fd7c29ca 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -123,7 +123,7 @@ struct iio_buffer { struct attribute_group buffer_group; /* @attrs: Standard attributes of the buffer. */ - const struct attribute **attrs; + const struct iio_dev_attr **attrs; /* @demux_bounce: Buffer for doing gather from incoming scan. */ void *demux_bounce; diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h index 8a83fb58232d..22874da0c8be 100644 --- a/include/linux/iio/kfifo_buf.h +++ b/include/linux/iio/kfifo_buf.h @@ -5,6 +5,7 @@ struct iio_buffer; struct iio_buffer_setup_ops; struct iio_dev; +struct iio_dev_attr; struct device; struct iio_buffer *iio_kfifo_allocate(void); @@ -13,7 +14,7 @@ void iio_kfifo_free(struct iio_buffer *r); int devm_iio_kfifo_buffer_setup_ext(struct device *dev, struct iio_dev *indio_dev, const struct iio_buffer_setup_ops *setup_ops, - const struct attribute **buffer_attrs); + const struct iio_dev_attr **buffer_attrs); #define devm_iio_kfifo_buffer_setup(dev, indio_dev, setup_ops) \ devm_iio_kfifo_buffer_setup_ext((dev), (indio_dev), (setup_ops), NULL) diff --git a/include/linux/iio/triggered_buffer.h b/include/linux/iio/triggered_buffer.h index 7490b05fc5b2..29e1fe146879 100644 --- a/include/linux/iio/triggered_buffer.h +++ b/include/linux/iio/triggered_buffer.h @@ -5,8 +5,8 @@ #include #include -struct attribute; struct iio_dev; +struct iio_dev_attr; struct iio_buffer_setup_ops; int iio_triggered_buffer_setup_ext(struct iio_dev *indio_dev, @@ -14,7 +14,7 @@ int iio_triggered_buffer_setup_ext(struct iio_dev *indio_dev, irqreturn_t (*thread)(int irq, void *p), enum iio_buffer_direction direction, const struct iio_buffer_setup_ops *setup_ops, - const struct attribute **buffer_attrs); + const struct iio_dev_attr **buffer_attrs); void iio_triggered_buffer_cleanup(struct iio_dev *indio_dev); #define iio_triggered_buffer_setup(indio_dev, h, thread, setup_ops) \ @@ -28,7 +28,7 @@ int devm_iio_triggered_buffer_setup_ext(struct device *dev, irqreturn_t (*thread)(int irq, void *p), enum iio_buffer_direction direction, const struct iio_buffer_setup_ops *ops, - const struct attribute **buffer_attrs); + const struct iio_dev_attr **buffer_attrs); #define devm_iio_triggered_buffer_setup(dev, indio_dev, h, thread, setup_ops) \ devm_iio_triggered_buffer_setup_ext((dev), (indio_dev), (h), (thread), \ -- cgit From 99c05e4283a19a02a256f14100ca4ec3b2da3f62 Mon Sep 17 00:00:00 2001 From: Ramona Bolboaca Date: Tue, 22 Nov 2022 10:27:49 +0200 Subject: iio: adis: add '__adis_enable_irq()' implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add '__adis_enable_irq()' implementation which is the unlocked version of 'adis_enable_irq()'. Call '__adis_enable_irq()' instead of 'adis_enable_irq()' from '__adis_intial_startup()' to keep the expected unlocked functionality. This fix is needed to remove a deadlock for all devices which are using 'adis_initial_startup()'. The deadlock occurs because the same mutex is acquired twice, without releasing it. The mutex is acquired once inside 'adis_initial_startup()', before calling '__adis_initial_startup()', and once inside 'adis_enable_irq()', which is called by '__adis_initial_startup()'. The deadlock is removed by calling '__adis_enable_irq()', instead of 'adis_enable_irq()' from within '__adis_initial_startup()'. Fixes: b600bd7eb3335 ("iio: adis: do not disabe IRQs in 'adis_init()'") Signed-off-by: Ramona Bolboaca Reviewed-by: Nuno Sá Link: https://lore.kernel.org/r/20221122082757.449452-2-ramona.bolboaca@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 515ca09764fe..bcbefb757475 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -402,9 +402,20 @@ static inline int adis_update_bits_base(struct adis *adis, unsigned int reg, __adis_update_bits_base(adis, reg, mask, val, sizeof(val)); \ }) -int adis_enable_irq(struct adis *adis, bool enable); int __adis_check_status(struct adis *adis); int __adis_initial_startup(struct adis *adis); +int __adis_enable_irq(struct adis *adis, bool enable); + +static inline int adis_enable_irq(struct adis *adis, bool enable) +{ + int ret; + + mutex_lock(&adis->state_lock); + ret = __adis_enable_irq(adis, enable); + mutex_unlock(&adis->state_lock); + + return ret; +} static inline int adis_check_status(struct adis *adis) { -- cgit From c613afc1f257e1e3229b8dcade43a104a26541c8 Mon Sep 17 00:00:00 2001 From: Ramona Bolboaca Date: Tue, 22 Nov 2022 10:27:57 +0200 Subject: iio: imu: adis: Remove adis_initial_startup function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove adis_initial_startup function since it is not used anymore. Signed-off-by: Ramona Bolboaca Reviewed-by: Nuno Sá Link: https://lore.kernel.org/r/20221122082757.449452-10-ramona.bolboaca@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index bcbefb757475..dc9ea299e088 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -428,18 +428,6 @@ static inline int adis_check_status(struct adis *adis) return ret; } -/* locked version of __adis_initial_startup() */ -static inline int adis_initial_startup(struct adis *adis) -{ - int ret; - - mutex_lock(&adis->state_lock); - ret = __adis_initial_startup(adis); - mutex_unlock(&adis->state_lock); - - return ret; -} - static inline void adis_dev_lock(struct adis *adis) { mutex_lock(&adis->state_lock); -- cgit From beb3d47d1d3d7185bb401af628ad32ee204a9526 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 07:57:59 -0800 Subject: bpf: Fix a BTF_ID_LIST bug with CONFIG_DEBUG_INFO_BTF not set With CONFIG_DEBUG_INFO_BTF not set, we hit the following compilation error, /.../kernel/bpf/verifier.c:8196:23: error: array index 6 is past the end of the array (that has type 'u32[5]' (aka 'unsigned int[5]')) [-Werror,-Warray-bounds] if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) ^ ~~~~~~~~~~~~~~~~~~~~~~~ /.../kernel/bpf/verifier.c:8174:1: note: array 'special_kfunc_list' declared here BTF_ID_LIST(special_kfunc_list) ^ /.../include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST' #define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; ^ /.../kernel/bpf/verifier.c:8443:19: error: array index 5 is past the end of the array (that has type 'u32[5]' (aka 'unsigned int[5]')) [-Werror,-Warray-bounds] btf_id == special_kfunc_list[KF_bpf_list_pop_back]; ^ ~~~~~~~~~~~~~~~~~~~~ /.../kernel/bpf/verifier.c:8174:1: note: array 'special_kfunc_list' declared here BTF_ID_LIST(special_kfunc_list) ^ /.../include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST' #define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; ... Fix the problem by increase the size of BTF_ID_LIST to 16 to avoid compilation error and also prevent potentially unintended issue due to out-of-bound access. Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: Nathan Chancellor Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221123155759.2669749-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index c9744efd202f..93397711a68c 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -204,7 +204,7 @@ extern struct btf_id_set8 name; #else -#define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[16]; #define BTF_ID(prefix, name) #define BTF_ID_FLAGS(prefix, name, ...) #define BTF_ID_UNUSED -- cgit From 04aabc32fb677f91d676fd306bca1043805e78d5 Mon Sep 17 00:00:00 2001 From: Song Chen Date: Thu, 20 Oct 2022 22:06:51 +0800 Subject: ring_buffer: Remove unused "event" parameter After commit a389d86f7fd0 ("ring-buffer: Have nested events still record running time stamp"), the "event" parameter is no longer used in either ring_buffer_unlock_commit() or rb_commit(). Best to remove it. Link: https://lkml.kernel.org/r/1666274811-24138-1-git-send-email-chensong_2000@189.cn Signed-off-by: Song Chen Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3c7d295746f6..782e14f62201 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -113,8 +113,7 @@ void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val); struct ring_buffer_event *ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length); -int ring_buffer_unlock_commit(struct trace_buffer *buffer, - struct ring_buffer_event *event); +int ring_buffer_unlock_commit(struct trace_buffer *buffer); int ring_buffer_write(struct trace_buffer *buffer, unsigned long length, void *data); -- cgit From 96e6122cb79616c622ae0d025eb9f981120b568d Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 10 Nov 2022 10:03:19 +0800 Subject: tracing: Optimize event type allocation with IDA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 060fa5c83e67 ("tracing/events: reuse trace event ids after overflow"), trace events with dynamic type are linked up in list 'ftrace_event_list' through field 'trace_event.list'. Then when max event type number used up, it's possible to reuse type number of some freed one by traversing 'ftrace_event_list'. As instead, using IDA to manage available type numbers can make codes simpler and then the field 'trace_event.list' can be dropped. Since 'struct trace_event' is used in static tracepoints, drop 'trace_event.list' can make vmlinux smaller. Local test with about 2000 tracepoints, vmlinux reduced about 64KB: before:-rwxrwxr-x 1 root root 76669448 Nov 8 17:14 vmlinux after: -rwxrwxr-x 1 root root 76604176 Nov 8 17:15 vmlinux Link: https://lkml.kernel.org/r/20221110020319.1259291-1-zhengyejian1@huawei.com Signed-off-by: Zheng Yejian Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 20749bd9db71..bb2053246d6a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -136,7 +136,6 @@ struct trace_event_functions { struct trace_event { struct hlist_node node; - struct list_head list; int type; struct trace_event_functions *funcs; }; -- cgit From c0071be0e16c461680d87b763ba1ee5e46548fde Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Mon, 31 Oct 2022 11:07:59 +0200 Subject: net/mlx5e: MACsec, remove replay window size limitation in offload path Currently offload path limits replay window size to 32/64/128/256 bits, such a limitation should not exist since software allows it. Remove such limitation. Fixes: eb43846b43c3 ("net/mlx5e: Support MACsec offload replay window") Signed-off-by: Emeel Hakim Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6f..981fc7dfa408 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11611,13 +11611,6 @@ enum { MLX5_MACSEC_ASO_REPLAY_PROTECTION = 0x1, }; -enum { - MLX5_MACSEC_ASO_REPLAY_WIN_32BIT = 0x0, - MLX5_MACSEC_ASO_REPLAY_WIN_64BIT = 0x1, - MLX5_MACSEC_ASO_REPLAY_WIN_128BIT = 0x2, - MLX5_MACSEC_ASO_REPLAY_WIN_256BIT = 0x3, -}; - #define MLX5_MACSEC_ASO_INC_SN 0x2 #define MLX5_MACSEC_ASO_REG_C_4_5 0x2 -- cgit From 9a5a305686971f4be10c6d7251c8348d74b3e014 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Nov 2022 21:18:37 +0100 Subject: timers: Get rid of del_singleshot_timer_sync() del_singleshot_timer_sync() used to be an optimization for deleting timers which are not rearmed from the timer callback function. This optimization turned out to be broken and got mapped to del_timer_sync() about 17 years ago. Get rid of the undocumented indirection and use del_timer_sync() directly. No functional change. Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Reviewed-by: Jacob Keller Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/r/20221123201624.706987932@linutronix.de --- include/linux/timer.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 648f00105f58..ea35d00123f0 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -190,8 +190,6 @@ extern int try_to_del_timer_sync(struct timer_list *timer); # define del_timer_sync(t) del_timer(t) #endif -#define del_singleshot_timer_sync(t) del_timer_sync(t) - extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); -- cgit From 168f6b6ffbeec0b9333f3582e4cf637300858db5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Nov 2022 21:18:42 +0100 Subject: timers: Use del_timer_sync() even on UP del_timer_sync() is assumed to be pointless on uniprocessor systems and can be mapped to del_timer() because in theory del_timer() can never be invoked while the timer callback function is executed. This is not entirely true because del_timer() can be invoked from interrupt context and therefore hit in the middle of a running timer callback. Contrary to that del_timer_sync() is not allowed to be invoked from interrupt context unless the affected timer is marked with TIMER_IRQSAFE. del_timer_sync() has proper checks in place to detect such a situation. Give up on the UP optimization and make del_timer_sync() unconditionally available. Co-developed-by: Steven Rostedt Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Reviewed-by: Jacob Keller Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/all/20220407161745.7d6754b3@gandalf.local.home Link: https://lore.kernel.org/all/20221110064101.429013735@goodmis.org Link: https://lore.kernel.org/r/20221123201624.888306160@linutronix.de --- include/linux/timer.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index ea35d00123f0..31b8b6018f0d 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -183,12 +183,7 @@ extern int timer_reduce(struct timer_list *timer, unsigned long expires); extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); - -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) - extern int del_timer_sync(struct timer_list *timer); -#else -# define del_timer_sync(t) del_timer(t) -#endif +extern int del_timer_sync(struct timer_list *timer); extern void init_timers(void); struct hrtimer; -- cgit From 9b13df3fb64ee95e2397585404e442afee2c7d4f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Nov 2022 21:18:44 +0100 Subject: timers: Rename del_timer_sync() to timer_delete_sync() The timer related functions do not have a strict timer_ prefixed namespace which is really annoying. Rename del_timer_sync() to timer_delete_sync() and provide del_timer_sync() as a wrapper. Document that del_timer_sync() is not for new code. Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Reviewed-by: Steven Rostedt (Google) Reviewed-by: Jacob Keller Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/r/20221123201624.954785441@linutronix.de --- include/linux/timer.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 31b8b6018f0d..551fa467726f 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -183,7 +183,20 @@ extern int timer_reduce(struct timer_list *timer, unsigned long expires); extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); -extern int del_timer_sync(struct timer_list *timer); +extern int timer_delete_sync(struct timer_list *timer); + +/** + * del_timer_sync - Delete a pending timer and wait for a running callback + * @timer: The timer to be deleted + * + * See timer_delete_sync() for detailed explanation. + * + * Do not use in new code. Use timer_delete_sync() instead. + */ +static inline int del_timer_sync(struct timer_list *timer) +{ + return timer_delete_sync(timer); +} extern void init_timers(void); struct hrtimer; -- cgit From bb663f0f3c396c6d05f6c5eeeea96ced20ff112e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Nov 2022 21:18:45 +0100 Subject: timers: Rename del_timer() to timer_delete() The timer related functions do not have a strict timer_ prefixed namespace which is really annoying. Rename del_timer() to timer_delete() and provide del_timer() as a wrapper. Document that del_timer() is not for new code. Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Reviewed-by: Steven Rostedt (Google) Reviewed-by: Jacob Keller Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/r/20221123201625.015535022@linutronix.de --- include/linux/timer.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 551fa467726f..e338e173ce8b 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -169,7 +169,6 @@ static inline int timer_pending(const struct timer_list * timer) } extern void add_timer_on(struct timer_list *timer, int cpu); -extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); extern int timer_reduce(struct timer_list *timer, unsigned long expires); @@ -184,6 +183,7 @@ extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int timer_delete_sync(struct timer_list *timer); +extern int timer_delete(struct timer_list *timer); /** * del_timer_sync - Delete a pending timer and wait for a running callback @@ -198,6 +198,19 @@ static inline int del_timer_sync(struct timer_list *timer) return timer_delete_sync(timer); } +/** + * del_timer - Delete a pending timer + * @timer: The timer to be deleted + * + * See timer_delete() for detailed explanation. + * + * Do not use in new code. Use timer_delete() instead. + */ +static inline int del_timer(struct timer_list *timer) +{ + return timer_delete(timer); +} + extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); -- cgit From f571faf6e443b6011ccb585d57866177af1f643c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Nov 2022 21:18:53 +0100 Subject: timers: Provide timer_shutdown[_sync]() Tearing down timers which have circular dependencies to other functionality, e.g. workqueues, where the timer can schedule work and work can arm timers, is not trivial. In those cases it is desired to shutdown the timer in a way which prevents rearming of the timer. The mechanism to do so is to set timer->function to NULL and use this as an indicator for the timer arming functions to ignore the (re)arm request. Expose new interfaces for this: timer_shutdown_sync() and timer_shutdown(). timer_shutdown_sync() has the same functionality as timer_delete_sync() plus the NULL-ification of the timer function. timer_shutdown() has the same functionality as timer_delete() plus the NULL-ification of the timer function. In both cases the rearming of the timer is prevented by silently discarding rearm attempts due to timer->function being NULL. Co-developed-by: Steven Rostedt Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Reviewed-by: Jacob Keller Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/all/20220407161745.7d6754b3@gandalf.local.home Link: https://lore.kernel.org/all/20221110064101.429013735@goodmis.org Link: https://lore.kernel.org/r/20221123201625.314230270@linutronix.de --- include/linux/timer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index e338e173ce8b..9162f275819a 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -184,6 +184,8 @@ extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int timer_delete_sync(struct timer_list *timer); extern int timer_delete(struct timer_list *timer); +extern int timer_shutdown_sync(struct timer_list *timer); +extern int timer_shutdown(struct timer_list *timer); /** * del_timer_sync - Delete a pending timer and wait for a running callback -- cgit From 26e8f6a75248247982458e8237b98c9fb2ffcf9d Mon Sep 17 00:00:00 2001 From: Heiko Schocher Date: Wed, 23 Nov 2022 08:16:36 +0100 Subject: can: sja1000: fix size of OCR_MODE_MASK define bitfield mode in ocr register has only 2 bits not 3, so correct the OCR_MODE_MASK define. Signed-off-by: Heiko Schocher Link: https://lore.kernel.org/all/20221123071636.2407823-1-hs@denx.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/platform/sja1000.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/can/platform/sja1000.h b/include/linux/can/platform/sja1000.h index 5755ae5a4712..6a869682c120 100644 --- a/include/linux/can/platform/sja1000.h +++ b/include/linux/can/platform/sja1000.h @@ -14,7 +14,7 @@ #define OCR_MODE_TEST 0x01 #define OCR_MODE_NORMAL 0x02 #define OCR_MODE_CLOCK 0x03 -#define OCR_MODE_MASK 0x07 +#define OCR_MODE_MASK 0x03 #define OCR_TX0_INVERT 0x04 #define OCR_TX0_PULLDOWN 0x08 #define OCR_TX0_PULLUP 0x10 -- cgit From 23680f0b7d7f67a935adb38058110d2d81bbe6ea Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 23 Nov 2022 13:25:19 +0100 Subject: driver core: make struct class.dev_uevent() take a const * The dev_uevent() in struct class should not be modifying the device that is passed into it, so mark it as a const * and propagate the function signature changes out into all relevant subsystems that use this callback. Cc: Jens Axboe Cc: Luis Chamberlain Cc: Russ Weight Cc: Jean Delvare Cc: Johan Hovold Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Karsten Keil Cc: Mauro Carvalho Chehab Cc: Keith Busch Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: Dominik Brodowski Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Johannes Berg Cc: Wolfram Sang Cc: Raed Salem Cc: Chen Zhongjin Cc: Tetsuo Handa Cc: Avihai Horon Cc: "Matthew Wilcox (Oracle)" Cc: Alan Stern Cc: Colin Ian King Cc: Geert Uytterhoeven Cc: Jakob Koschel Cc: Antoine Tenart Cc: Frederic Weisbecker Cc: Wang Yufen Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-media@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: linux-pm@vger.kernel.org Cc: linux-rdma@vger.kernel.org Cc: linux-usb@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Acked-by: Sebastian Reichel Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20221123122523.1332370-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 2 +- include/linux/mISDNif.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 20103e0b03c3..94b1107258e5 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -59,7 +59,7 @@ struct class { const struct attribute_group **dev_groups; struct kobject *dev_kobj; - int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); + int (*dev_uevent)(const struct device *dev, struct kobj_uevent_env *env); char *(*devnode)(struct device *dev, umode_t *mode); void (*class_release)(struct class *class); diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 7dd1f01ec4f9..7aab4a769736 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -586,7 +586,7 @@ extern struct mISDNclock *mISDN_register_clock(char *, int, clockctl_func_t *, void *); extern void mISDN_unregister_clock(struct mISDNclock *); -static inline struct mISDNdevice *dev_to_mISDN(struct device *dev) +static inline struct mISDNdevice *dev_to_mISDN(const struct device *dev) { if (dev) return dev_get_drvdata(dev); -- cgit From ff62b8e6588fb07bedda7423622c140c4edd66a7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 23 Nov 2022 13:25:20 +0100 Subject: driver core: make struct class.devnode() take a const * MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The devnode() in struct class should not be modifying the device that is passed into it, so mark it as a const * and propagate the function signature changes out into all relevant subsystems that use this callback. Cc: Fenghua Yu Cc: Reinette Chatre Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: FUJITA Tomonori Cc: Jens Axboe Cc: Justin Sanders Cc: Arnd Bergmann Cc: Sumit Semwal Cc: Benjamin Gaignard Cc: Liam Mark Cc: Laura Abbott Cc: Brian Starkey Cc: John Stultz Cc: "Christian König" Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Dennis Dalessandro Cc: Dmitry Torokhov Cc: Mauro Carvalho Chehab Cc: Sean Young Cc: Frank Haverkamp Cc: Jiri Slaby Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Alex Williamson Cc: Cornelia Huck Cc: Kees Cook Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Hans Verkuil Cc: Christophe JAILLET Cc: Xie Yongji Cc: Gautam Dawar Cc: Dan Carpenter Cc: Eli Cohen Cc: Parav Pandit Cc: Maxime Coquelin Cc: alsa-devel@alsa-project.org Cc: dri-devel@lists.freedesktop.org Cc: kvm@vger.kernel.org Cc: linaro-mm-sig@lists.linaro.org Cc: linux-block@vger.kernel.org Cc: linux-input@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-media@vger.kernel.org Cc: linux-rdma@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: linux-usb@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Link: https://lore.kernel.org/r/20221123122523.1332370-2-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 94b1107258e5..42cc3fb44a84 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -60,7 +60,7 @@ struct class { struct kobject *dev_kobj; int (*dev_uevent)(const struct device *dev, struct kobj_uevent_env *env); - char *(*devnode)(struct device *dev, umode_t *mode); + char *(*devnode)(const struct device *dev, umode_t *mode); void (*class_release)(struct class *class); void (*dev_release)(struct device *dev); -- cgit From f2756526450d19bca28714565062a8f286c05049 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 23 Nov 2022 22:30:13 -0800 Subject: genirq/irqreturn: Fix kernel-doc warnings irqreturn.h:6: warning: missing initial short description on line: * enum irqreturn irqreturn.h:15: warning: Enum value 'IRQ_NONE' not described in enum 'irqreturn' irqreturn.h:15: warning: Enum value 'IRQ_HANDLED' not described in enum 'irqreturn' irqreturn.h:15: warning: Enum value 'IRQ_WAKE_THREAD' not described in enum 'irqreturn' Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20221124063013.28479-1-rdunlap@infradead.org --- include/linux/irqreturn.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqreturn.h b/include/linux/irqreturn.h index bd4c066ad39b..d426c7ad92bf 100644 --- a/include/linux/irqreturn.h +++ b/include/linux/irqreturn.h @@ -3,10 +3,10 @@ #define _LINUX_IRQRETURN_H /** - * enum irqreturn - * @IRQ_NONE interrupt was not from this device or was not handled - * @IRQ_HANDLED interrupt was handled by this device - * @IRQ_WAKE_THREAD handler requests to wake the handler thread + * enum irqreturn - irqreturn type values + * @IRQ_NONE: interrupt was not from this device or was not handled + * @IRQ_HANDLED: interrupt was handled by this device + * @IRQ_WAKE_THREAD: handler requests to wake the handler thread */ enum irqreturn { IRQ_NONE = (0 << 0), -- cgit From 8d84e39d76bd83474b26cb44f4b338635676e7e8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 22 Nov 2022 11:40:32 +0100 Subject: fs: use consistent setgid checks in is_sxid() Now that we made the VFS setgid checking consistent an inode can't be marked security irrelevant even if the setgid bit is still set. Make this function consistent with all other helpers. Note that enforcing consistent setgid stripping checks for file modification and mode- and ownership changes will cause the setgid bit to be lost in more cases than useed to be the case. If an unprivileged user wrote to a non-executable setgid file that they don't have privilege over the setgid bit will be dropped. This will lead to temporary failures in some xfstests until they have been updated. Reported-by: Miklos Szeredi Signed-off-by: Christian Brauner (Microsoft) --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b39c5efca180..0a96f5eeee69 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3527,7 +3527,7 @@ int __init list_bdev_fs_names(char *buf, size_t size); static inline bool is_sxid(umode_t mode) { - return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); + return mode & (S_ISUID | S_ISGID); } static inline int check_sticky(struct user_namespace *mnt_userns, -- cgit From 5a0f663f0189cf9e031e444f97c029717a99548d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:06 -0800 Subject: compiler_types: Define __rcu as __attribute__((btf_type_tag("rcu"))) Currently, without rcu attribute info in BTF, the verifier treats rcu tagged pointer as a normal pointer. This might be a problem for sleepable program where rcu_read_lock()/unlock() is not available. For example, for a sleepable fentry program, if rcu protected memory access is interleaved with a sleepable helper/kfunc, it is possible the memory access after the sleepable helper/kfunc might be invalid since the object might have been freed then. To prevent such cases, introducing rcu tagging for memory accesses in verifier can help to reject such programs. To enable rcu tagging in BTF, during kernel compilation, define __rcu as attribute btf_type_tag("rcu") so __rcu information can be preserved in dwarf and btf, and later can be used for bpf prog verification. Acked-by: KP Singh Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053206.2373141-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/compiler_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index eb0466236661..7c1afe0f4129 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -49,7 +49,8 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # endif # define __iomem # define __percpu BTF_TYPE_TAG(percpu) -# define __rcu +# define __rcu BTF_TYPE_TAG(rcu) + # define __chk_user_ptr(x) (void)0 # define __chk_io_ptr(x) (void)0 /* context/locking */ -- cgit From 01685c5bddaa6df3d662c8afed5e5289fcc68e5a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:11 -0800 Subject: bpf: Introduce might_sleep field in bpf_func_proto Introduce bpf_func_proto->might_sleep to indicate a particular helper might sleep. This will make later check whether a helper might be sleepable or not easier. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053211.2373553-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c9eafa67f2a2..43fd7eeeeabb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -682,6 +682,7 @@ struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; bool pkt_access; + bool might_sleep; enum bpf_return_type ret_type; union { struct { -- cgit From 9bb00b2895cbfe0ad410457b605d0a72524168c1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:17 -0800 Subject: bpf: Add kfunc bpf_rcu_read_lock/unlock() Add two kfunc's bpf_rcu_read_lock() and bpf_rcu_read_unlock(). These two kfunc's can be used for all program types. The following is an example about how rcu pointer are used w.r.t. bpf_rcu_read_lock()/bpf_rcu_read_unlock(). struct task_struct { ... struct task_struct *last_wakee; struct task_struct __rcu *real_parent; ... }; Let us say prog does 'task = bpf_get_current_task_btf()' to get a 'task' pointer. The basic rules are: - 'real_parent = task->real_parent' should be inside bpf_rcu_read_lock region. This is to simulate rcu_dereference() operation. The 'real_parent' is marked as MEM_RCU only if (1). task->real_parent is inside bpf_rcu_read_lock region, and (2). task is a trusted ptr. So MEM_RCU marked ptr can be 'trusted' inside the bpf_rcu_read_lock region. - 'last_wakee = real_parent->last_wakee' should be inside bpf_rcu_read_lock region since it tries to access rcu protected memory. - the ptr 'last_wakee' will be marked as PTR_UNTRUSTED since in general it is not clear whether the object pointed by 'last_wakee' is valid or not even inside bpf_rcu_read_lock region. The verifier will reset all rcu pointer register states to untrusted at bpf_rcu_read_unlock() kfunc call site, so any such rcu pointer won't be trusted any more outside the bpf_rcu_read_lock() region. The current implementation does not support nested rcu read lock region in the prog. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053217.2373910-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/linux/bpf_verifier.h | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 43fd7eeeeabb..c6aa6912ea16 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -572,6 +572,9 @@ enum bpf_type_flag { */ PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), + /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */ + MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 545152ac136c..c05aa6e1f6f5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -344,6 +344,7 @@ struct bpf_verifier_state { u32 id; } active_lock; bool speculative; + bool active_rcu_lock; /* first and last insn idx of this verifier state */ u32 first_insn_idx; @@ -445,6 +446,7 @@ struct bpf_insn_aux_data { u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ + bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ u8 alu_state; /* used in combination with alu_limit */ /* below fields are initialized once */ @@ -534,6 +536,7 @@ struct bpf_verifier_env { bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; + bool rcu_tag_supported; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; @@ -680,7 +683,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { -- cgit From bdbadfcc37c5c8f9f2a401a18eae71b0c28799ee Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Sep 2022 20:23:56 -0400 Subject: [elf][non-regset] uninline elf_core_copy_task_fpregs() (and lose pt_regs argument) Don't bother with pointless macros - we are not sharing it with aout coredumps anymore. Just convert the underlying functions to the same arguments (nobody uses regs, actually) and call them elf_core_copy_task_fpregs(). And unexport the entire bunch, while we are at it. [added missing includes in arch/{csky,m68k,um}/kernel/process.c to avoid extra warnings about the lack of externs getting added to huge piles for those files. Pointless, but...] Signed-off-by: Al Viro --- include/linux/elfcore.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index fcf58e16d1e3..9ec81290e3c8 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -94,16 +94,7 @@ static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* return 0; } -extern int dump_fpu (struct pt_regs *, elf_fpregset_t *); - -static inline int elf_core_copy_task_fpregs(struct task_struct *t, struct pt_regs *regs, elf_fpregset_t *fpu) -{ -#ifdef ELF_CORE_COPY_FPREGS - return ELF_CORE_COPY_FPREGS(t, fpu); -#else - return dump_fpu(regs, fpu); -#endif -} +int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu); #ifdef CONFIG_ARCH_BINFMT_ELF_EXTRA_PHDRS /* -- cgit From cda2ed05aade303b7d89844a0333168c3484634a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Oct 2022 13:46:26 +0100 Subject: fs: simplify vfs_get_super Remove the pointless keying argument and associated enum and pass the fill_super callback and a "bool reconf" instead. Also mark the function static given that there are no users outside of super.c. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs_context.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 13fa6f3df8e4..87a34f2fa68d 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -145,20 +145,6 @@ extern void fc_drop_locked(struct fs_context *fc); int reconfigure_single(struct super_block *s, int flags, void *data); -/* - * sget() wrappers to be called from the ->get_tree() op. - */ -enum vfs_get_super_keying { - vfs_get_single_super, /* Only one such superblock may exist */ - vfs_get_single_reconf_super, /* As above, but reconfigure if it exists */ - vfs_get_keyed_super, /* Superblocks with different s_fs_info keys may exist */ - vfs_get_independent_super, /* Multiple independent superblocks may exist */ -}; -extern int vfs_get_super(struct fs_context *fc, - enum vfs_get_super_keying keying, - int (*fill_super)(struct super_block *sb, - struct fs_context *fc)); - extern int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)); -- cgit From 10bc8e4af65946b727728d7479c028742321b60a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 17 Nov 2022 22:52:49 +0200 Subject: vfs: fix copy_file_range() averts filesystem freeze protection Commit 868f9f2f8e00 ("vfs: fix copy_file_range() regression in cross-fs copies") removed fallback to generic_copy_file_range() for cross-fs cases inside vfs_copy_file_range(). To preserve behavior of nfsd and ksmbd server-side-copy, the fallback to generic_copy_file_range() was added in nfsd and ksmbd code, but that call is missing sb_start_write(), fsnotify hooks and more. Ideally, nfsd and ksmbd would pass a flag to vfs_copy_file_range() that will take care of the fallback, but that code would be subtle and we got vfs_copy_file_range() logic wrong too many times already. Instead, add a flag to explicitly request vfs_copy_file_range() to perform only generic_copy_file_range() and let nfsd and ksmbd use this flag only in the fallback path. This choise keeps the logic changes to minimum in the non-nfsd/ksmbd code paths to reduce the risk of further regressions. Fixes: 868f9f2f8e00 ("vfs: fix copy_file_range() regression in cross-fs copies") Tested-by: Namjae Jeon Tested-by: Luis Henriques Signed-off-by: Amir Goldstein Signed-off-by: Al Viro --- include/linux/fs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e654435f1651..59ae95ddb679 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2089,6 +2089,14 @@ struct dir_context { */ #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) +/* + * These flags control the behavior of vfs_copy_file_range(). + * They are not available to the user via syscall. + * + * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops + */ +#define COPY_FILE_SPLICE (1 << 0) + struct iov_iter; struct io_uring_cmd; -- cgit From 931147ddfa6e9ffd814272f1c0370c4740acbe17 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:54 -0800 Subject: io_uring: allow defer completion for aux posted cqes Multishot ops cannot use the compl_reqs list as the request must stay in the poll list, but that means they need to run each completion without benefiting from batching. Here introduce batching infrastructure for only small (ie 16 byte) CQEs. This restriction is ok because there are no use cases posting 32 byte CQEs. In the ring keep a batch of up to 16 posted results, and flush in the same way as compl_reqs. 16 was chosen through experimentation on a microbenchmark ([1]), as well as trying not to increase the size of the ring too much. This increases the size to 1472 bytes from 1216. [1]: https://github.com/DylanZA/liburing/commit/9ac66b36bcf4477bfafeff1c5f107896b7ae31cf Run with $ make -j && ./benchmark/reg.b -s 1 -t 2000 -r 10 Gives results: baseline 8309 k/s 8 18807 k/s 16 19338 k/s 32 20134 k/s Suggested-by: Pavel Begunkov Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-5-dylany@meta.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f5b687a787a3..accdfecee953 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -174,7 +174,9 @@ struct io_submit_state { bool plug_started; bool need_plug; unsigned short submit_nr; + unsigned int cqes_count; struct blk_plug plug; + struct io_uring_cqe cqes[16]; }; struct io_ev_fd { -- cgit From 8935002fc37fce1ad211d98a70f2fd42083c0594 Mon Sep 17 00:00:00 2001 From: Mikko Perttunen Date: Wed, 7 Sep 2022 11:38:42 +0300 Subject: gpu: host1x: Select context device based on attached IOMMU On Tegra234, engines that are programmed through Host1x channels can be attached to either the NISO0 or NISO1 SMMU. Because of that, when selecting a context device to use with an engine, we need to select one that is also attached to the same SMMU. Add a parameter to host1x_memory_context_alloc to specify which device we are allocating a context for, and use it to pick an appropriate context device. Signed-off-by: Mikko Perttunen [treding@nvidia.com: update !IOMMU_API stub signature] Signed-off-by: Thierry Reding --- include/linux/host1x.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index cb2100d9b0ff..dc55d9d3b94f 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -469,11 +469,13 @@ struct host1x_memory_context { #ifdef CONFIG_IOMMU_API struct host1x_memory_context *host1x_memory_context_alloc(struct host1x *host1x, + struct device *dev, struct pid *pid); void host1x_memory_context_get(struct host1x_memory_context *cd); void host1x_memory_context_put(struct host1x_memory_context *cd); #else static inline struct host1x_memory_context *host1x_memory_context_alloc(struct host1x *host1x, + struct device *dev, struct pid *pid) { return NULL; -- cgit From b2bd0a8c3ab11f355392c7b81aec5187fc0d562e Mon Sep 17 00:00:00 2001 From: Ben Levinsky Date: Mon, 14 Nov 2022 15:39:37 -0800 Subject: firmware: xilinx: Add ZynqMP firmware ioctl enums for RPU configuration. Add ZynqMP firmware ioctl enums for RPU configuration and TCM Nodes for later use via request_node and release_node Signed-off-by: Ben Levinsky Signed-off-by: Tanmay Shah Acked-by: Michal Simek Link: https://lore.kernel.org/r/20221114233940.2096237-4-tanmay.shah@amd.com Signed-off-by: Mathieu Poirier --- include/linux/firmware/xlnx-zynqmp.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 76d2b3ebad84..bdbf855b5eef 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -135,6 +135,10 @@ enum pm_ret_status { }; enum pm_ioctl_id { + IOCTL_GET_RPU_OPER_MODE = 0, + IOCTL_SET_RPU_OPER_MODE = 1, + IOCTL_RPU_BOOT_ADDR_CONFIG = 2, + IOCTL_TCM_COMB_CONFIG = 3, IOCTL_SD_DLL_RESET = 6, IOCTL_SET_SD_TAPDELAY = 7, IOCTL_SET_PLL_FRAC_MODE = 8, @@ -175,6 +179,21 @@ enum pm_query_id { PM_QID_CLOCK_GET_MAX_DIVISOR = 13, }; +enum rpu_oper_mode { + PM_RPU_MODE_LOCKSTEP = 0, + PM_RPU_MODE_SPLIT = 1, +}; + +enum rpu_boot_mem { + PM_RPU_BOOTMEM_LOVEC = 0, + PM_RPU_BOOTMEM_HIVEC = 1, +}; + +enum rpu_tcm_comb { + PM_RPU_TCM_SPLIT = 0, + PM_RPU_TCM_COMB = 1, +}; + enum zynqmp_pm_reset_action { PM_RESET_ACTION_RELEASE = 0, PM_RESET_ACTION_ASSERT = 1, -- cgit From da22a04f4727694e2c562ae4eb61daf77eef0427 Mon Sep 17 00:00:00 2001 From: Ben Levinsky Date: Mon, 14 Nov 2022 15:39:38 -0800 Subject: firmware: xilinx: Add shutdown/wakeup APIs Add shutdown/wakeup a resource eemi operations to shutdown or bringup a resource. Note alignment of args matches convention of other fn's in this file. The reason being that the long fn name results in aligned args that otherwise go over 80 chars so shift right to avoid this Signed-off-by: Ben Levinsky Signed-off-by: Tanmay Shah Acked-by: Michal Simek Link: https://lore.kernel.org/r/20221114233940.2096237-5-tanmay.shah@amd.com Signed-off-by: Mathieu Poirier --- include/linux/firmware/xlnx-zynqmp.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index bdbf855b5eef..ad3f2bd0c470 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -12,6 +12,7 @@ #ifndef __FIRMWARE_ZYNQMP_H__ #define __FIRMWARE_ZYNQMP_H__ +#include #include @@ -87,6 +88,8 @@ enum pm_api_cb_id { enum pm_api_id { PM_GET_API_VERSION = 1, PM_REGISTER_NOTIFIER = 5, + PM_FORCE_POWERDOWN = 8, + PM_REQUEST_WAKEUP = 10, PM_SYSTEM_SHUTDOWN = 12, PM_REQUEST_NODE = 13, PM_RELEASE_NODE = 14, @@ -521,6 +524,12 @@ int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id); int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value); int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload); int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset); +int zynqmp_pm_force_pwrdwn(const u32 target, + const enum zynqmp_pm_request_ack ack); +int zynqmp_pm_request_wake(const u32 node, + const bool set_addr, + const u64 address, + const enum zynqmp_pm_request_ack ack); int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value); int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config, u32 value); @@ -795,6 +804,20 @@ static inline int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset) return -ENODEV; } +static inline int zynqmp_pm_force_pwrdwn(const u32 target, + const enum zynqmp_pm_request_ack ack) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_request_wake(const u32 node, + const bool set_addr, + const u64 address, + const enum zynqmp_pm_request_ack ack) +{ + return -ENODEV; +} + static inline int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value) -- cgit From a5e56980cfb7ecaeb9a207c74e2e90ec544f0bc0 Mon Sep 17 00:00:00 2001 From: Ben Levinsky Date: Mon, 14 Nov 2022 15:39:39 -0800 Subject: firmware: xilinx: Add RPU configuration APIs This patch adds APIs to access to configure RPU and its processor-specific memory. That is query the run-time mode of RPU as either split or lockstep as well as API to set this mode. In addition add APIs to access configuration of the RPUs' tightly coupled memory (TCM). Signed-off-by: Ben Levinsky Signed-off-by: Tanmay Shah Acked-by: Michal Simek Link: https://lore.kernel.org/r/20221114233940.2096237-6-tanmay.shah@amd.com Signed-off-by: Mathieu Poirier --- include/linux/firmware/xlnx-zynqmp.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index ad3f2bd0c470..cf92e739fa3b 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -530,6 +530,9 @@ int zynqmp_pm_request_wake(const u32 node, const bool set_addr, const u64 address, const enum zynqmp_pm_request_ack ack); +int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode); +int zynqmp_pm_set_rpu_mode(u32 node_id, u32 arg1); +int zynqmp_pm_set_tcm_config(u32 node_id, u32 arg1); int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value); int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config, u32 value); @@ -818,6 +821,21 @@ static inline int zynqmp_pm_request_wake(const u32 node, return -ENODEV; } +static inline int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_set_rpu_mode(u32 node_id, u32 arg1) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_set_tcm_config(u32 node_id, u32 arg1) +{ + return -ENODEV; +} + static inline int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value) -- cgit From de4eda9de2d957ef2d6a8365a01e26a435e958cb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Sep 2022 20:25:47 -0400 Subject: use less confusing names for iov_iter direction initializers READ/WRITE proved to be actively confusing - the meanings are "data destination, as used with read(2)" and "data source, as used with write(2)", but people keep interpreting those as "we read data from it" and "we write data to it", i.e. exactly the wrong way. Call them ITER_DEST and ITER_SOURCE - at least that is harder to misinterpret... Signed-off-by: Al Viro --- include/linux/uio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 2e3134b14ffd..87fc3d0dda98 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -29,6 +29,9 @@ enum iter_type { ITER_UBUF, }; +#define ITER_SOURCE 1 // == WRITE +#define ITER_DEST 0 // == READ + struct iov_iter_state { size_t iov_offset; size_t count; -- cgit From bf0d29fb51ff5e6c13097dbfed7b99e0e35b4a15 Mon Sep 17 00:00:00 2001 From: Eddie James Date: Wed, 2 Nov 2022 15:51:44 -0500 Subject: regmap: Add FSI bus support Add regmap support for the FSI bus. Signed-off-by: Eddie James Link: https://lore.kernel.org/r/20221102205148.1334459-2-eajames@linux.ibm.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index ca3434dca3a0..e477112fb1c7 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -24,6 +24,7 @@ struct module; struct clk; struct device; struct device_node; +struct fsi_device; struct i2c_client; struct i3c_device; struct irq_domain; @@ -628,6 +629,10 @@ struct regmap *__regmap_init_spi_avmm(struct spi_device *spi, const struct regmap_config *config, struct lock_class_key *lock_key, const char *lock_name); +struct regmap *__regmap_init_fsi(struct fsi_device *fsi_dev, + const struct regmap_config *config, + struct lock_class_key *lock_key, + const char *lock_name); struct regmap *__devm_regmap_init(struct device *dev, const struct regmap_bus *bus, @@ -693,6 +698,11 @@ struct regmap *__devm_regmap_init_spi_avmm(struct spi_device *spi, const struct regmap_config *config, struct lock_class_key *lock_key, const char *lock_name); +struct regmap *__devm_regmap_init_fsi(struct fsi_device *fsi_dev, + const struct regmap_config *config, + struct lock_class_key *lock_key, + const char *lock_name); + /* * Wrapper for regmap_init macros to include a unique lockdep key and name * for each call. No-op if CONFIG_LOCKDEP is not set. @@ -919,6 +929,19 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); __regmap_lockdep_wrapper(__regmap_init_spi_avmm, #config, \ spi, config) +/** + * regmap_init_fsi() - Initialise register map + * + * @fsi_dev: Device that will be interacted with + * @config: Configuration for register map + * + * The return value will be an ERR_PTR() on error or a valid pointer to + * a struct regmap. + */ +#define regmap_init_fsi(fsi_dev, config) \ + __regmap_lockdep_wrapper(__regmap_init_fsi, #config, fsi_dev, \ + config) + /** * devm_regmap_init() - Initialise managed register map * @@ -1148,6 +1171,20 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); __regmap_lockdep_wrapper(__devm_regmap_init_spi_avmm, #config, \ spi, config) +/** + * devm_regmap_init_fsi() - Initialise managed register map + * + * @fsi_dev: Device that will be interacted with + * @config: Configuration for register map + * + * The return value will be an ERR_PTR() on error or a valid pointer + * to a struct regmap. The regmap will be automatically freed by the + * device management code. + */ +#define devm_regmap_init_fsi(fsi_dev, config) \ + __regmap_lockdep_wrapper(__devm_regmap_init_fsi, #config, \ + fsi_dev, config) + int regmap_mmio_attach_clk(struct regmap *map, struct clk *clk); void regmap_mmio_detach_clk(struct regmap *map); void regmap_exit(struct regmap *map); -- cgit From ea258f159da14a710f9cb88656558538b5ba5b76 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Fri, 25 Nov 2022 17:13:58 +0800 Subject: get rid of INT_LIMIT, use type_max() instead INT_LIMIT() tries to do what type_max() does, except that type_max() doesn't rely upon undefined behaviour[*], might as well use type_max() instead. [*] if T is an N-bit signed integer type, the maximal value in T is pow(2, N - 1) - 1, all right, but naive expression for that value ends up with a couple of wraparounds and as usual for wraparounds in signed types, that's an undefined behaviour. type_max() takes care to avoid those... Caught-by: UBSAN Suggested-by: Eric Biggers Signed-off-by: Zhen Lei Reviewed-by: Eric Biggers Signed-off-by: Al Viro --- include/linux/fs.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e654435f1651..a384741b1449 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1131,9 +1131,8 @@ struct file_lock_context { /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX -#define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) -#define OFFSET_MAX INT_LIMIT(loff_t) -#define OFFT_OFFSET_MAX INT_LIMIT(off_t) +#define OFFSET_MAX type_max(loff_t) +#define OFFT_OFFSET_MAX type_max(off_t) #endif extern void send_sigio(struct fown_struct *fown, int fd, int band); -- cgit From 346907ceb9d11b9e22677c142b45ff50dd20a66a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 16 Nov 2022 15:56:32 +0100 Subject: mm, slab: ignore hardened usercopy parameters when disabled With CONFIG_HARDENED_USERCOPY not enabled, there are no __check_heap_object() checks happening that would use the struct kmem_cache useroffset and usersize fields. Yet the fields are still initialized, preventing merging of otherwise compatible caches. Also the fields contribute to struct kmem_cache size unnecessarily when unused. Thus #ifdef them out completely when CONFIG_HARDENED_USERCOPY is disabled. In kmem_dump_obj() print object_size instead of usersize, as that's actually the intention. In a quick virtme boot test, this has reduced the number of caches in /proc/slabinfo from 131 to 111. Cc: Kees Cook Signed-off-by: Vlastimil Babka Acked-by: Roman Gushchin Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- include/linux/slab_def.h | 2 ++ include/linux/slub_def.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index f0ffad6a3365..5834bad8ad78 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -80,8 +80,10 @@ struct kmem_cache { unsigned int *random_seq; #endif +#ifdef CONFIG_HARDENED_USERCOPY unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ +#endif struct kmem_cache_node *node[MAX_NUMNODES]; }; diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index f9c68a9dac04..7ed5e455cbf4 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -136,8 +136,10 @@ struct kmem_cache { struct kasan_cache kasan_info; #endif +#ifdef CONFIG_HARDENED_USERCOPY unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ +#endif struct kmem_cache_node *node[MAX_NUMNODES]; }; -- cgit From b1a413a39a1a7694acf3636a52c109821148ecdd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Nov 2022 18:18:39 +0100 Subject: mm, slub: disable SYSFS support with CONFIG_SLUB_TINY Currently SLUB enables its sysfs support depending unconditionally on the general CONFIG_SYSFS setting. To reduce the configuration combination space, make CONFIG_SLUB_TINY disable SLUB's sysfs support by reusing the existing SLAB_SUPPORTS_SYSFS define. It is unlikely that real tiny systems would combine CONFIG_SLUB_TINY with CONFIG_SYSFS, but a randconfig might. Signed-off-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- include/linux/slub_def.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 7ed5e455cbf4..95b67863d5cf 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -144,7 +144,7 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; -#ifdef CONFIG_SYSFS +#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) #define SLAB_SUPPORTS_SYSFS void sysfs_slab_unlink(struct kmem_cache *); void sysfs_slab_release(struct kmem_cache *); -- cgit From 2f7c1c1396b587e8cfe18a1f0d628cedaae56b6a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Nov 2022 18:19:28 +0100 Subject: mm, slub: don't create kmalloc-rcl caches with CONFIG_SLUB_TINY Distinguishing kmalloc(__GFP_RECLAIMABLE) can help against fragmentation by grouping pages by mobility, but on tiny systems the extra memory overhead of separate set of kmalloc-rcl caches will probably be worse, and mobility grouping likely disabled anyway. Thus with CONFIG_SLUB_TINY, don't create kmalloc-rcl caches and use the regular ones. Signed-off-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- include/linux/slab.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 45efc6c553b8..ae2d19ec8467 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -336,12 +336,17 @@ enum kmalloc_cache_type { #endif #ifndef CONFIG_MEMCG_KMEM KMALLOC_CGROUP = KMALLOC_NORMAL, -#else - KMALLOC_CGROUP, #endif +#ifdef CONFIG_SLUB_TINY + KMALLOC_RECLAIM = KMALLOC_NORMAL, +#else KMALLOC_RECLAIM, +#endif #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, +#endif +#ifdef CONFIG_MEMCG_KMEM + KMALLOC_CGROUP, #endif NR_KMALLOC_TYPES }; -- cgit From 3d97d976e5d58554570003ca5297a67142ae4e29 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 16 Nov 2022 16:48:23 +0100 Subject: mm, slab: ignore SLAB_RECLAIM_ACCOUNT with CONFIG_SLUB_TINY SLAB_RECLAIM_ACCOUNT caches allocate their slab pages with __GFP_RECLAIMABLE and can help against fragmentation by grouping pages by mobility, but on tiny systems mobility grouping is likely disabled anyway and ignoring SLAB_RECLAIM_ACCOUNT might instead lead to merging of caches that are made incompatible just by the flag. Thus with CONFIG_SLUB_TINY, make SLAB_RECLAIM_ACCOUNT ineffective. Signed-off-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- include/linux/slab.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index ae2d19ec8467..37fa41af24b6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -129,7 +129,11 @@ /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ +#ifndef CONFIG_SLUB_TINY #define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) +#else +#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0) +#endif #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* -- cgit From 0eb43812c0270ee3d005ff32f91f7d0a6c4943af Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 26 Aug 2022 19:44:44 -0400 Subject: NFS: Clear the file access cache upon login POSIX typically only refreshes the user's supplementary group information upon login. Since NFS servers may often refresh their concept of the user supplementary group membership at their own cadence, it is possible for the NFS client's access cache to become stale due to the user's group membership changing on the server after the user has already logged in on the client. While it is reasonable to expect that such group membership changes are rare, and that we do not want to optimise the cache to accommodate them, it is also not unreasonable for the user to expect that if they log out and log back in again, that the staleness would clear up. Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 7931fa472561..d92fdfd2444c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -59,6 +59,7 @@ struct nfs_access_entry { kuid_t fsuid; kgid_t fsgid; struct group_info *group_info; + u64 timestamp; __u32 mask; struct rcu_head rcu_head; }; -- cgit From a66d79ee0bd5140a64b72cde588f8c83a55a1eb9 Mon Sep 17 00:00:00 2001 From: Sujuan Chen Date: Thu, 24 Nov 2022 11:18:14 +0800 Subject: net: ethernet: mtk_wed: add wcid overwritten support for wed v1 All wed versions should enable the wcid overwritten feature, since the wcid size is controlled by the wlan driver. Tested-by: Sujuan Chen Co-developed-by: Bo Jiao Signed-off-by: Bo Jiao Signed-off-by: Sujuan Chen Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 8294978f4bca..1b1ef57609f7 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -85,6 +85,9 @@ struct mtk_wed_device { int irq; u8 version; + /* used by wlan driver */ + u32 rev_id; + struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; struct mtk_wed_ring rx_ring[MTK_WED_RX_QUEUES]; struct mtk_wed_ring txfree_ring; -- cgit From 1d044ca035dc22df0d3b39e56f2881071d9118bd Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Fri, 19 Aug 2022 19:17:29 -0300 Subject: video: hyperv_fb: Avoid taking busy spinlock on panic path The Hyper-V framebuffer code registers a panic notifier in order to try updating its fbdev if the kernel crashed. The notifier callback is straightforward, but it calls the vmbus_sendpacket() routine eventually, and such function takes a spinlock for the ring buffer operations. Panic path runs in atomic context, with local interrupts and preemption disabled, and all secondary CPUs shutdown. That said, taking a spinlock might cause a lockup if a secondary CPU was disabled with such lock taken. Fix it here by checking if the ring buffer spinlock is busy on Hyper-V framebuffer panic notifier; if so, bail-out avoiding the potential lockup scenario. Cc: Andrea Parri (Microsoft) Cc: Dexuan Cui Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Michael Kelley Cc: Stephen Hemminger Cc: Tianyu Lan Cc: Wei Liu Tested-by: Fabio A M Martins Signed-off-by: Guilherme G. Piccoli Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220819221731.480795-10-gpiccoli@igalia.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 3b42264333ef..646f1da9f27e 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1341,6 +1341,8 @@ struct hv_ring_buffer_debug_info { int hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info, struct hv_ring_buffer_debug_info *debug_info); +bool hv_ringbuffer_spinlock_busy(struct vmbus_channel *channel); + /* Vmbus interface */ #define vmbus_driver_register(driver) \ __vmbus_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) -- cgit From fce444fa37bef2d3382b7b44e42c6ef65040ec67 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Sat, 5 Nov 2022 11:54:01 +0000 Subject: hv: fix comment typo in vmbus_channel/low_latency morev vs. more. Signed-off-by: Olaf Hering Link: https://lore.kernel.org/r/20221105115401.21592-1-olaf@aepfle.de Signed-off-by: Wei Liu --- include/linux/hyperv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 646f1da9f27e..85f7c5a63aa6 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -969,7 +969,7 @@ struct vmbus_channel { * mechanism improves throughput by: * * A) Making the host more efficient - each time it wakes up, - * potentially it will process morev number of packets. The + * potentially it will process more number of packets. The * monitor latency allows a batch to build up. * B) By deferring the hypercall to signal, we will also minimize * the interrupts. -- cgit From 5a717a6e0145481ea4c188d8be2d6217cefed270 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 16 Oct 2022 11:46:56 -0400 Subject: SUNRPC: Remove unused svc_rqst::rq_lock field Clean up after commit 22700f3c6df5 ("SUNRPC: Improve ordering of transport processing"). Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 88de45491376..0edd837d401d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -311,7 +311,6 @@ struct svc_rqst { struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ struct svc_cacherep * rq_cacherep; /* cache info */ struct task_struct *rq_task; /* service thread */ - spinlock_t rq_lock; /* per-request lock */ struct net *rq_bc_net; /* pointer to backchannel's * net namespace */ -- cgit From 923d011febb4e2fb338036bb0ee6a0a7f9b10da1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 27 Nov 2022 22:52:10 +0100 Subject: gpio: Do not include when not really needed. is included only for using container_of(). Include instead, it is much lighter. Signed-off-by: Christophe JAILLET Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/linux/of_gpio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index a5166eb93437..6db627257a7b 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -34,7 +34,7 @@ enum of_gpio_flags { #ifdef CONFIG_OF_GPIO -#include +#include /* * OF GPIO chip for memory mapped banks -- cgit From 3ca9d84e722e8044c09e80992aa7b15bd904d3ce Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 23 Nov 2022 19:40:01 -0500 Subject: KVM: always declare prototype for kvm_arch_irqchip_in_kernel Architecture code might want to use it even if CONFIG_HAVE_KVM_IRQ_ROUTING is false; for example PPC XICS has KVM_IRQ_LINE and wants to use kvm_arch_irqchip_in_kernel from there, but it does not have KVM_SET_GSI_ROUTING so the prototype was not provided. Fixes: d663b8a28598 ("KVM: replace direct irq.h inclusion") Reported-by: kernel test robot Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6f0f389f5f9c..b8d12356f015 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -663,9 +663,9 @@ struct kvm_irq_routing_table { */ struct hlist_head map[]; }; +#endif bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); -#endif #ifndef KVM_INTERNAL_MEM_SLOTS #define KVM_INTERNAL_MEM_SLOTS 0 -- cgit From d7b64041164ca177170191d2ad775da074ab2926 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: iomap: write iomap validity checks A recent multithreaded write data corruption has been uncovered in the iomap write code. The core of the problem is partial folio writes can be flushed to disk while a new racing write can map it and fill the rest of the page: writeback new write allocate blocks blocks are unwritten submit IO ..... map blocks iomap indicates UNWRITTEN range loop { lock folio copyin data ..... IO completes runs unwritten extent conv blocks are marked written get next folio } Now add memory pressure such that memory reclaim evicts the partially written folio that has already been written to disk. When the new write finally gets to the last partial page of the new write, it does not find it in cache, so it instantiates a new page, sees the iomap is unwritten, and zeros the part of the page that it does not have data from. This overwrites the data on disk that was originally written. The full description of the corruption mechanism can be found here: https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ To solve this problem, we need to check whether the iomap is still valid after we lock each folio during the write. We have to do it after we lock the page so that we don't end up with state changes occurring while we wait for the folio to be locked. Hence we need a mechanism to be able to check that the cached iomap is still valid (similar to what we already do in buffered writeback), and we need a way for ->begin_write to back out and tell the high level iomap iterator that we need to remap the remaining write range. The iomap needs to grow some storage for the validity cookie that the filesystem provides to travel with the iomap. XFS, in particular, also needs to know some more information about what the iomap maps (attribute extents rather than file data extents) to for the validity cookie to cover all the types of iomaps we might need to validate. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- include/linux/iomap.h | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 0698c4b8ce0e..0983dfc9a203 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -49,26 +49,35 @@ struct vm_fault; * * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of * buffer heads for this mapping. + * + * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent + * rather than a file data extent. */ -#define IOMAP_F_NEW 0x01 -#define IOMAP_F_DIRTY 0x02 -#define IOMAP_F_SHARED 0x04 -#define IOMAP_F_MERGED 0x08 -#define IOMAP_F_BUFFER_HEAD 0x10 -#define IOMAP_F_ZONE_APPEND 0x20 +#define IOMAP_F_NEW (1U << 0) +#define IOMAP_F_DIRTY (1U << 1) +#define IOMAP_F_SHARED (1U << 2) +#define IOMAP_F_MERGED (1U << 3) +#define IOMAP_F_BUFFER_HEAD (1U << 4) +#define IOMAP_F_ZONE_APPEND (1U << 5) +#define IOMAP_F_XATTR (1U << 6) /* * Flags set by the core iomap code during operations: * * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size * has changed as the result of this write operation. + * + * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file + * range it covers needs to be remapped by the high level before the operation + * can proceed. */ -#define IOMAP_F_SIZE_CHANGED 0x100 +#define IOMAP_F_SIZE_CHANGED (1U << 8) +#define IOMAP_F_STALE (1U << 9) /* * Flags from 0x1000 up are for file system specific usage: */ -#define IOMAP_F_PRIVATE 0x1000 +#define IOMAP_F_PRIVATE (1U << 12) /* @@ -89,6 +98,7 @@ struct iomap { void *inline_data; void *private; /* filesystem private */ const struct iomap_page_ops *page_ops; + u64 validity_cookie; /* used with .iomap_valid() */ }; static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) @@ -128,6 +138,23 @@ struct iomap_page_ops { int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, struct page *page); + + /* + * Check that the cached iomap still maps correctly to the filesystem's + * internal extent map. FS internal extent maps can change while iomap + * is iterating a cached iomap, so this hook allows iomap to detect that + * the iomap needs to be refreshed during a long running write + * operation. + * + * The filesystem can store internal state (e.g. a sequence number) in + * iomap->validity_cookie when the iomap is first mapped to be able to + * detect changes between mapping time and whenever .iomap_valid() is + * called. + * + * This is called with the folio over the specified file position held + * locked by the iomap code. + */ + bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); }; /* -- cgit From 27ef17849779edd5600aa27d1a246ad424761971 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Mon, 28 Nov 2022 19:29:54 +0900 Subject: usb: add usb_set_intfdata() documentation USB drivers do not need to call usb_set_intfdata(intf, NULL) in their usb_driver::disconnect callback because the core already does it in [1]. However, this fact is widely unknown, c.f.: $ git grep "usb_set_intfdata(.*NULL)" | wc -l 215 Especially, setting the interface to NULL before all action completed can result in a NULL pointer dereference. Not calling usb_set_intfdata() at all in disconnect() is the safest method. Add documentation to usb_set_intfdata() to clarify this point. Also remove the call in usb-skeletion's disconnect() not to confuse the new comers. [1] function usb_unbind_interface() from drivers/usb/core/driver.c Link: https://elixir.bootlin.com/linux/v6.0/source/drivers/usb/core/driver.c#L497 Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20221128102954.3615579-1-mailhol.vincent@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 9ff1ad4dfad1..d4afeeec1e1a 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -265,6 +265,18 @@ static inline void *usb_get_intfdata(struct usb_interface *intf) return dev_get_drvdata(&intf->dev); } +/** + * usb_set_intfdata() - associate driver-specific data with the interface + * @intf: the usb interface + * @data: pointer to the device priv structure or %NULL + * + * Drivers should use this function in their probe() to associate their + * driver-specific data with the usb interface. + * + * When disconnecting, the core will take care of setting @intf back to %NULL, + * so no actions are needed on the driver side. The interface should not be set + * to %NULL before all actions completed (e.g. no outsanding URB remaining). + */ static inline void usb_set_intfdata(struct usb_interface *intf, void *data) { dev_set_drvdata(&intf->dev, data); -- cgit From 032399819dd5f135e6ffe446c8e97ab54eec3464 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 22 Nov 2022 22:05:36 +0000 Subject: usb: typec: Add partner PD object wrapper Some port drivers may want to set a Type-C partner as a parent for a USB Power Delivery object, but the Type-C partner struct isn't exposed outside of the Type-C class driver. Add a wrapper to usb_power_delivery_register() which sets the provided Type-C partner as a parent to the USB PD object. This helps to avoid exposing the Type-C partner's device struct unnecessarily. Cc: Benson Leung Suggested-by: Heikki Krogerus Reviewed-by: Heikki Krogerus Signed-off-by: Prashant Malani Link: https://lore.kernel.org/r/20221122220538.2991775-2-pmalani@chromium.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index 7751bedcae5d..8fa781207970 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -23,6 +23,7 @@ struct fwnode_handle; struct device; struct usb_power_delivery; +struct usb_power_delivery_desc; enum typec_port_type { TYPEC_PORT_SRC, @@ -327,6 +328,9 @@ void typec_partner_set_svdm_version(struct typec_partner *partner, enum usb_pd_svdm_ver svdm_version); int typec_get_negotiated_svdm_version(struct typec_port *port); +struct usb_power_delivery *typec_partner_usb_power_delivery_register(struct typec_partner *partner, + struct usb_power_delivery_desc *desc); + int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_delivery *pd); int typec_partner_set_usb_power_delivery(struct typec_partner *partner, struct usb_power_delivery *pd); -- cgit From a9efc04cfd05690e91279f41c2325c46335c43ef Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 23 Nov 2022 16:48:58 +0200 Subject: i915: Move list_count() to list.h for broader use Some of the existing users, and definitely will be new ones, want to count existing nodes in the list. Provide a generic API for that by moving code from i915 to list.h. Reviewed-by: Lucas De Marchi Acked-by: Jani Nikula Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20221123144901.40493-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/list.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 61762054b4be..632a298c7018 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -655,6 +655,21 @@ static inline void list_splice_tail_init(struct list_head *list, !list_is_head(pos, (head)); \ pos = n, n = pos->prev) +/** + * list_count - count nodes in the list + * @head: the head for your list. + */ +static inline size_t list_count(struct list_head *head) +{ + struct list_head *pos; + size_t count = 0; + + list_for_each(pos, head) + count++; + + return count; +} + /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor -- cgit From b0284cd29a957e62d60c2886fd663be93c56f9c0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 3 Nov 2022 18:10:34 -0700 Subject: mm: Do not enable PG_arch_2 for all 64-bit architectures Commit 4beba9486abd ("mm: Add PG_arch_2 page flag") introduced a new page flag for all 64-bit architectures. However, even if an architecture is 64-bit, it may still have limited spare bits in the 'flags' member of 'struct page'. This may happen if an architecture enables SPARSEMEM without SPARSEMEM_VMEMMAP as is the case with the newly added loongarch. This architecture port needs 19 more bits for the sparsemem section information and, while it is currently fine with PG_arch_2, adding any more PG_arch_* flags will trigger build-time warnings. Add a new CONFIG_ARCH_USES_PG_ARCH_X option which can be selected by architectures that need more PG_arch_* flags beyond PG_arch_1. Select it on arm64. Signed-off-by: Catalin Marinas [pcc@google.com: fix build with CONFIG_ARM64_MTE disabled] Signed-off-by: Peter Collingbourne Reported-by: kernel test robot Cc: Andrew Morton Cc: Steven Price Reviewed-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-2-pcc@google.com --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0b0ae5084e60..5dc7977edf9d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -132,7 +132,7 @@ enum pageflags { PG_young, PG_idle, #endif -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X PG_arch_2, #endif #ifdef CONFIG_KASAN_HW_TAGS -- cgit From ef6458b1b6ca3fdb991ce4182e981a88d4c58c0f Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 3 Nov 2022 18:10:37 -0700 Subject: mm: Add PG_arch_3 page flag As with PG_arch_2, this flag is only allowed on 64-bit architectures due to the shortage of bits available. It will be used by the arm64 MTE code in subsequent patches. Signed-off-by: Peter Collingbourne Cc: Will Deacon Cc: Marc Zyngier Cc: Steven Price [catalin.marinas@arm.com: added flag preserving in __split_huge_page_tail()] Signed-off-by: Catalin Marinas Reviewed-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-5-pcc@google.com --- include/linux/kernel-page-flags.h | 1 + include/linux/page-flags.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index eee1877a354e..859f4b0c1b2b 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -18,5 +18,6 @@ #define KPF_UNCACHED 39 #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 +#define KPF_ARCH_3 42 #endif /* LINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5dc7977edf9d..c50ce2812f17 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -134,6 +134,7 @@ enum pageflags { #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_X PG_arch_2, + PG_arch_3, #endif #ifdef CONFIG_KASAN_HW_TAGS PG_skip_kasan_poison, -- cgit From 4c47867bc789bdc722f3bb760355c2c246fbe9af Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 24 Nov 2022 12:15:54 +0100 Subject: of: net: export of_get_mac_address_nvmem() Export of_get_mac_addr_nvmem() and rename it to of_get_mac_address_nvmem() in order to fit the convention followed by the existing exported helpers of the same kind. This way, OF compatible drivers using eg. fwnode_get_mac_address() can do a direct call to it instead of calling of_get_mac_address() just for the nvmem step, avoiding to repeat an expensive DT lookup which has already been done once. Eventually, fwnode_get_mac_address() should probably be updated to perform the nvmem lookup directly, but as of today, nvmem cells seem not to be supported by ACPI yet which would defeat this kind of extension. Suggested-by: Marcin Wojtas Signed-off-by: Miquel Raynal Signed-off-by: Paolo Abeni --- include/linux/of_net.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 0484b613ca64..d88715a0b3a5 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -14,6 +14,7 @@ struct net_device; extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface); extern int of_get_mac_address(struct device_node *np, u8 *mac); +extern int of_get_mac_address_nvmem(struct device_node *np, u8 *mac); int of_get_ethdev_address(struct device_node *np, struct net_device *dev); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else @@ -28,6 +29,11 @@ static inline int of_get_mac_address(struct device_node *np, u8 *mac) return -ENODEV; } +static inline int of_get_mac_address_nvmem(struct device_node *np, u8 *mac) +{ + return -ENODEV; +} + static inline int of_get_ethdev_address(struct device_node *np, struct net_device *dev) { return -ENODEV; -- cgit From f78cd9c783e09a0fe454b0fc8b39c22025d7869e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 24 Nov 2022 16:22:53 +0100 Subject: net: ethernet: mtk_wed: update mtk_wed_stop Update mtk_wed_stop routine and rename old mtk_wed_stop() to mtk_wed_deinit(). This is a preliminary patch to add Wireless Ethernet Dispatcher reset support. Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 1b1ef57609f7..c43510e541df 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -234,6 +234,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ (_dev)->ops->msg_update(_dev, _id, _msg, _len) +#define mtk_wed_device_stop(_dev) (_dev)->ops->stop(_dev) +#define mtk_wed_device_dma_reset(_dev) (_dev)->ops->reset_dma(_dev) #else static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) { @@ -250,6 +252,8 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV +#define mtk_wed_device_stop(_dev) do {} while (0) +#define mtk_wed_device_dma_reset(_dev) do {} while (0) #endif #endif -- cgit From 23dca7a90017ff2512c501f7da4c7ca7a95c2d6e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 24 Nov 2022 16:22:55 +0100 Subject: net: ethernet: mtk_wed: add reset to tx_ring_setup callback Introduce reset parameter to mtk_wed_tx_ring_setup signature. This is a preliminary patch to add Wireless Ethernet Dispatcher reset support. Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index c43510e541df..beb190449704 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -158,7 +158,7 @@ struct mtk_wed_device { struct mtk_wed_ops { int (*attach)(struct mtk_wed_device *dev); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, - void __iomem *regs); + void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs); int (*txfree_ring_setup)(struct mtk_wed_device *dev, @@ -216,8 +216,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) #define mtk_wed_device_active(_dev) !!(_dev)->ops #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev) #define mtk_wed_device_start(_dev, _mask) (_dev)->ops->start(_dev, _mask) -#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) \ - (_dev)->ops->tx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs, _reset) \ + (_dev)->ops->tx_ring_setup(_dev, _ring, _regs, _reset) #define mtk_wed_device_txfree_ring_setup(_dev, _regs) \ (_dev)->ops->txfree_ring_setup(_dev, _regs) #define mtk_wed_device_reg_read(_dev, _reg) \ @@ -243,7 +243,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) } #define mtk_wed_device_detach(_dev) do {} while (0) #define mtk_wed_device_start(_dev, _mask) do {} while (0) -#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs, _reset) -ENODEV #define mtk_wed_device_txfree_ring_setup(_dev, _ring, _regs) -ENODEV #define mtk_wed_device_reg_read(_dev, _reg) 0 #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) -- cgit From dda3bbbb26c823cd54d5bf211df7db12147c9392 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 29 Nov 2022 01:30:05 -0800 Subject: Revert "net/mlx5e: MACsec, remove replay window size limitation in offload path" This reverts commit c0071be0e16c461680d87b763ba1ee5e46548fde. The cited commit removed the validity checks which initialized the window_sz and never removed the use of the now uninitialized variable, so now we are left with wrong value in the window size and the following clang warning: [-Wuninitialized] drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c:232:45: warning: variable 'window_sz' is uninitialized when used here MLX5_SET(macsec_aso, aso_ctx, window_size, window_sz); Revet at this time to address the clang issue due to lack of time to test the proper solution. Fixes: c0071be0e16c ("net/mlx5e: MACsec, remove replay window size limitation in offload path") Signed-off-by: Saeed Mahameed Reported-by: Jacob Keller Link: https://lore.kernel.org/r/20221129093006.378840-1-saeed@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 981fc7dfa408..5a4e914e2a6f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11611,6 +11611,13 @@ enum { MLX5_MACSEC_ASO_REPLAY_PROTECTION = 0x1, }; +enum { + MLX5_MACSEC_ASO_REPLAY_WIN_32BIT = 0x0, + MLX5_MACSEC_ASO_REPLAY_WIN_64BIT = 0x1, + MLX5_MACSEC_ASO_REPLAY_WIN_128BIT = 0x2, + MLX5_MACSEC_ASO_REPLAY_WIN_256BIT = 0x3, +}; + #define MLX5_MACSEC_ASO_INC_SN 0x2 #define MLX5_MACSEC_ASO_REG_C_4_5 0x2 -- cgit From 4989764d8ed3d3d1024e4e831ff2affc40ee01d6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:24 -0400 Subject: iommu: Add IOMMU_CAP_ENFORCE_CACHE_COHERENCY This queries if a domain linked to a device should expect to support enforce_cache_coherency() so iommufd can negotiate the rules for when a domain should be shared or not. For iommufd a device that declares IOMMU_CAP_ENFORCE_CACHE_COHERENCY will not be attached to a domain that does not support it. Link: https://lore.kernel.org/r/1-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 68d7d304cdb7..a09fd32d8cc2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -124,6 +124,11 @@ enum iommu_cap { IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */ IOMMU_CAP_PRE_BOOT_PROTECTION, /* Firmware says it used the IOMMU for DMA protection and we should too */ + /* + * Per-device flag indicating if enforce_cache_coherency() will work on + * this device. + */ + IOMMU_CAP_ENFORCE_CACHE_COHERENCY, }; /* These are the possible reserved region types */ -- cgit From 89395ccedbc153fecbc29342fbb94a6dfadf24cd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 29 Nov 2022 16:29:25 -0400 Subject: iommu: Add device-centric DMA ownership interfaces These complement the group interfaces used by VFIO and are for use by iommufd. The main difference is that multiple devices in the same group can all share the ownership by passing the same ownership pointer. Move the common code into shared functions. Link: https://lore.kernel.org/r/2-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Signed-off-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a09fd32d8cc2..1690c334e516 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -707,6 +707,9 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +int iommu_device_claim_dma_owner(struct device *dev, void *owner); +void iommu_device_release_dma_owner(struct device *dev); + struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm); int iommu_attach_device_pasid(struct iommu_domain *domain, @@ -1064,6 +1067,15 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) return false; } +static inline void iommu_device_release_dma_owner(struct device *dev) +{ +} + +static inline int iommu_device_claim_dma_owner(struct device *dev, void *owner) +{ + return -ENODEV; +} + static inline struct iommu_domain * iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) { -- cgit From 5fe937862c8426f24cd1dcbf7c22fb1a31069b4f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:26 -0400 Subject: interval-tree: Add a utility to iterate over spans in an interval tree The span iterator travels over the indexes of the interval_tree, not the nodes, and classifies spans of indexes as either 'used' or 'hole'. 'used' spans are fully covered by nodes in the tree and 'hole' spans have no node intersecting the span. This is done greedily such that spans are maximally sized and every iteration step switches between used/hole. As an example a trivial allocator can be written as: for (interval_tree_span_iter_first(&span, itree, 0, ULONG_MAX); !interval_tree_span_iter_done(&span); interval_tree_span_iter_next(&span)) if (span.is_hole && span.last_hole - span.start_hole >= allocation_size - 1) return span.start_hole; With all the tricky boundary conditions handled by the library code. The following iommufd patches have several algorithms for its overlapping node interval trees that are significantly simplified with this kind of iteration primitive. As it seems generally useful, put it into lib/. Link: https://lore.kernel.org/r/3-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/interval_tree.h | 58 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h index 288c26f50732..2b8026a39906 100644 --- a/include/linux/interval_tree.h +++ b/include/linux/interval_tree.h @@ -27,4 +27,62 @@ extern struct interval_tree_node * interval_tree_iter_next(struct interval_tree_node *node, unsigned long start, unsigned long last); +/** + * struct interval_tree_span_iter - Find used and unused spans. + * @start_hole: Start of an interval for a hole when is_hole == 1 + * @last_hole: Inclusive end of an interval for a hole when is_hole == 1 + * @start_used: Start of a used interval when is_hole == 0 + * @last_used: Inclusive end of a used interval when is_hole == 0 + * @is_hole: 0 == used, 1 == is_hole, -1 == done iteration + * + * This iterator travels over spans in an interval tree. It does not return + * nodes but classifies each span as either a hole, where no nodes intersect, or + * a used, which is fully covered by nodes. Each iteration step toggles between + * hole and used until the entire range is covered. The returned spans always + * fully cover the requested range. + * + * The iterator is greedy, it always returns the largest hole or used possible, + * consolidating all consecutive nodes. + * + * Use interval_tree_span_iter_done() to detect end of iteration. + */ +struct interval_tree_span_iter { + /* private: not for use by the caller */ + struct interval_tree_node *nodes[2]; + unsigned long first_index; + unsigned long last_index; + + /* public: */ + union { + unsigned long start_hole; + unsigned long start_used; + }; + union { + unsigned long last_hole; + unsigned long last_used; + }; + int is_hole; +}; + +void interval_tree_span_iter_first(struct interval_tree_span_iter *state, + struct rb_root_cached *itree, + unsigned long first_index, + unsigned long last_index); +void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter, + struct rb_root_cached *itree, + unsigned long new_index); +void interval_tree_span_iter_next(struct interval_tree_span_iter *state); + +static inline bool +interval_tree_span_iter_done(struct interval_tree_span_iter *state) +{ + return state->is_hole == -1; +} + +#define interval_tree_for_each_span(span, itree, first_index, last_index) \ + for (interval_tree_span_iter_first(span, itree, \ + first_index, last_index); \ + !interval_tree_span_iter_done(span); \ + interval_tree_span_iter_next(span)) + #endif /* _LINUX_INTERVAL_TREE_H */ -- cgit From 3cb278e73be58bfb780ecd55129296d2f74c1fb7 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 16 Oct 2022 16:22:54 +0000 Subject: rcu: Make call_rcu() lazy to save power Implement timer-based RCU callback batching (also known as lazy callbacks). With this we save about 5-10% of power consumed due to RCU requests that happen when system is lightly loaded or idle. By default, all async callbacks (queued via call_rcu) are marked lazy. An alternate API call_rcu_hurry() is provided for the few users, for example synchronize_rcu(), that need the old behavior. The batch is flushed whenever a certain amount of time has passed, or the batch on a particular CPU grows too big. Also memory pressure will flush it in a future patch. To handle several corner cases automagically (such as rcu_barrier() and hotplug), we re-use bypass lists which were originally introduced to address lock contention, to handle lazy CBs as well. The bypass list length has the lazy CB length included in it. A separate lazy CB length counter is also introduced to keep track of the number of lazy CBs. [ paulmck: Fix formatting of inline call_rcu_lazy() definition. ] [ paulmck: Apply Zqiang feedback. ] [ paulmck: Apply s/call_rcu_flush/call_rcu_hurry/ feedback from Tejun Heo. ] Suggested-by: Paul McKenney Acked-by: Frederic Weisbecker Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 08605ce7379d..611c11383d23 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -108,6 +108,15 @@ static inline int rcu_preempt_depth(void) #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_RCU_LAZY +void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func); +#else +static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) +{ + call_rcu(head, func); +} +#endif + /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active; -- cgit From 0cd7e350abc40eed5d3b60292dc102f700c88388 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2022 13:53:57 -0800 Subject: rcu: Make SRCU mandatory Kernels configured with CONFIG_PRINTK=n and CONFIG_SRCU=n get build failures. This causes trouble for deep embedded systems. But given that there are more than 25 instances of "select SRCU" in the kernel, it is hard to believe that there are many kernels running in production without SRCU. This commit therefore makes SRCU mandatory. The SRCU Kconfig option remains for backwards compatibility, and will be removed when it is no longer used. [ paulmck: Update per kernel test robot feedback. ] Reported-by: John Ogness Reported-by: Petr Mladek Signed-off-by: Paul E. McKenney Cc: Acked-by: Randy Dunlap # build-tested Reviewed-by: John Ogness --- include/linux/rcutiny.h | 4 ---- include/linux/srcu.h | 9 +-------- 2 files changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 768196a5f39d..d40b21ec7e0d 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -154,11 +154,7 @@ static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t) return false; } static inline void rcu_preempt_deferred_qs(struct task_struct *t) { } -#ifdef CONFIG_SRCU void rcu_scheduler_starting(void); -#else /* #ifndef CONFIG_SRCU */ -static inline void rcu_scheduler_starting(void) { } -#endif /* #else #ifndef CONFIG_SRCU */ static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f0814ffca34b..9b9d0bbf1d3c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -47,11 +47,8 @@ int init_srcu_struct(struct srcu_struct *ssp); #include #elif defined(CONFIG_TREE_SRCU) #include -#elif defined(CONFIG_SRCU) -#error "Unknown SRCU implementation specified to kernel configuration" #else -/* Dummy definition for things like notifiers. Actual use gets link error. */ -struct srcu_struct { }; +#error "Unknown SRCU implementation specified to kernel configuration" #endif void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, @@ -78,11 +75,7 @@ static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) } #endif /* CONFIG_NEED_SRCU_NMI_SAFE */ -#ifdef CONFIG_SRCU void srcu_init(void); -#else /* #ifdef CONFIG_SRCU */ -static inline void srcu_init(void) { } -#endif /* #else #ifdef CONFIG_SRCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -- cgit From 12eb0f84a601cec8920b56d7ffd4182a6bfd6521 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Thu, 29 Sep 2022 13:45:08 +0200 Subject: net/mlx5: Remove unused ctx variables Remove mlx5_priv.ctx_list and ctx_lock which are no longer used after commit 601c10c89cbb ("net/mlx5: Delete custom device management logic"). Signed-off-by: Petr Pavlu Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 06cbad166225..d476255c9a3f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -606,8 +606,6 @@ struct mlx5_priv { struct list_head pgdir_list; /* end: alloc staff */ - struct list_head ctx_list; - spinlock_t ctx_lock; struct mlx5_adev **adev; int adev_idx; int sw_vhca_id; -- cgit From b146658f2ed90768b769222ca418617274242b32 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:18:22 +0200 Subject: net/mlx5e: Add padding when needed in UMR WQEs Per the device spec, MTTs/KLMs list in a UMR WQE must be aligned to 64B. Per our SW design, the MTT/KLMs list would need alignment only if it's too small, for example on PPC when PAGE_SIZE is 64KB, and only 4 pages are needed to cover a MPWQE of size 256KB. Padding, if needed, is taken into account when calculating the UMR WQE fields (ds_cnt and xlt_octowords), however no entries are provided, instead garbage is passed. No real harm though, as these parts act as gaps between the RX MPWQEs and not used by any of them. Hence, in practice, device does not try to write any incoming packet to them. Still, prefer providing clean padding marking the end of the list, and do not map garbage into the RQ memory region. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index eb3fac30488b..97275965f156 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -294,6 +294,7 @@ enum { #define MLX5_UMR_MTT_ALIGNMENT 0x40 #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT +#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit From 683d78a0d46235edfd3c50ddbc4af1065b422670 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:30:23 +0200 Subject: net/mlx5: Remove unused UMR MTT definitions Defines MLX5_UMR_MTT_MASK and MLX5_UMR_MTT_MIN_CHUNK_SIZE are not in use. Remove them. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 97275965f156..ecf840e2989b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -292,8 +292,6 @@ enum { #define MLX5_UMR_KLM_ALIGNMENT 4 #define MLX5_UMR_MTT_ALIGNMENT 0x40 -#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) -#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit From 02648b4b09d506bd4df2e17bf109c229fc728640 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:44:57 +0200 Subject: net/mlx5: Generalize name of UMR alignment definition Per the device spec, MLX5_UMR_MTT_ALIGNMENT is good not only for UMR MTT entries, but for all other entries as well, like KLMs and KSMs. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ecf840e2989b..a02f779f5c5b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -291,8 +291,8 @@ enum { }; #define MLX5_UMR_KLM_ALIGNMENT 4 -#define MLX5_UMR_MTT_ALIGNMENT 0x40 -#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) +#define MLX5_UMR_FLEX_ALIGNMENT 0x40 +#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit From daab2e9c54a52dea8dbd1af516e6f986eeed2556 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:24:02 +0200 Subject: net/mlx5: Use generic definition for UMR KLM alignment MLX5_UMR_KLM_ALIGNMENT is in units of number of entries, while MLX5_UMR_MTT_ALIGNMENT (generalized and renamed to MLX5_UMR_FLEX_ALIGNMENT) is in byte units. This is misleading and confusing. Replace this KLM definition with one based on the generic definition. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index a02f779f5c5b..5fe5d198b57a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -290,9 +290,9 @@ enum { MLX5_UMR_INLINE = (1 << 7), }; -#define MLX5_UMR_KLM_ALIGNMENT 4 #define MLX5_UMR_FLEX_ALIGNMENT 0x40 #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) +#define MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_klm)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit From ab1ddef98a715eddb65309ffa83267e4e84a571e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 14 Nov 2022 08:33:09 -0500 Subject: filelock: new helper: vfs_inode_has_locks Ceph has a need to know whether a particular inode has any locks set on it. It's currently tracking that by a num_locks field in its filp->private_data, but that's problematic as it tries to decrement this field when releasing locks and that can race with the file being torn down. Add a new vfs_inode_has_locks helper that just returns whether any locks are currently held on the inode. Reviewed-by: Xiubo Li Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- include/linux/fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e654435f1651..6165c6245347 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1170,6 +1170,7 @@ extern int locks_delete_block(struct file_lock *); extern int vfs_test_lock(struct file *, struct file_lock *); extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); +bool vfs_inode_has_locks(struct inode *inode); extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); extern void lease_get_mtime(struct inode *, struct timespec64 *time); @@ -1284,6 +1285,11 @@ static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) return 0; } +static inline bool vfs_inode_has_locks(struct inode *inode) +{ + return false; +} + static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { return -ENOLCK; -- cgit From 401a8b8fd5acd51582b15238d72a8d0edd580e9f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:02:30 -0500 Subject: filelock: add a new locks_inode_context accessor function There are a number of places in the kernel that are accessing the inode->i_flctx field without smp_load_acquire. This is required to ensure that the caller doesn't see a partially-initialized structure. Add a new accessor function for it to make this clear and convert all of the relevant accesses in locks.c to use it. Also, convert locks_free_lock_context to use the helper as well instead of just doing a "bare" assignment. Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- include/linux/fs.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6165c6245347..fd0a79511fd8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1187,6 +1187,13 @@ extern void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); extern bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner); + +static inline struct file_lock_context * +locks_inode_context(const struct inode *inode) +{ + return smp_load_acquire(&inode->i_flctx); +} + #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) @@ -1332,6 +1339,13 @@ static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, { return false; } + +static inline struct file_lock_context * +locks_inode_context(const struct inode *inode) +{ + return NULL; +} + #endif /* !CONFIG_FILE_LOCKING */ static inline struct inode *file_inode(const struct file *f) -- cgit From 51daa42d6b86efa366320b99e7bbe29a490ed348 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 30 Nov 2022 12:02:13 +0100 Subject: Revert "i915: Move list_count() to list.h for broader use" This reverts commit a9efc04cfd05690e91279f41c2325c46335c43ef as it breaks the build. Link: https://lore.kernel.org/r/20221130131854.35b58b16@canb.auug.org.au Link: https://lore.kernel.org/r/202211301628.iwMjPVMp-lkp@intel.com Cc: Lucas De Marchi Cc: Jani Nikula Cc: Andy Shevchenko Reported-by: Stephen Rothwell Signed-off-by: Greg Kroah-Hartman --- include/linux/list.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 632a298c7018..61762054b4be 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -655,21 +655,6 @@ static inline void list_splice_tail_init(struct list_head *list, !list_is_head(pos, (head)); \ pos = n, n = pos->prev) -/** - * list_count - count nodes in the list - * @head: the head for your list. - */ -static inline size_t list_count(struct list_head *head) -{ - struct list_head *pos; - size_t count = 0; - - list_for_each(pos, head) - count++; - - return count; -} - /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor -- cgit From c3bbacd61baace2f4fbab17012c3d149df2d50f1 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Wed, 30 Nov 2022 11:19:42 +0200 Subject: xhci: disable U3 suspended ports in S4 hibernate poweroff_late stage Disable U3 suspended ports in hibernate S4 poweroff_late for systems with XHCI_RESET_TO_DEFAULT quirk, if wakeup is not enabled. This reduces the number of self-powered usb devices from surviving in U3 suspended state into next reboot. Bootloader/firmware on these systems can't handle usb ports in U3, and will timeout, causing extra delay during reboot/restore from S4. Add pci_poweroff_late() callback to struct usb_hcd to get this done at the correct stage in hibernate. Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20221130091944.2171610-5-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 78cd566ee238..b51c07111729 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -269,6 +269,9 @@ struct hc_driver { /* called after entering D0 (etc), before resuming the hub */ int (*pci_resume)(struct usb_hcd *hcd, bool hibernated); + /* called just before hibernate final D3 state, allows host to poweroff parts */ + int (*pci_poweroff_late)(struct usb_hcd *hcd, bool do_wakeup); + /* cleanly make HCD stop writing memory and doing I/O */ void (*stop) (struct usb_hcd *hcd); -- cgit From c62256dda37133a48d56cecc15e4a4d527d4cc46 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Nov 2022 08:25:46 -0700 Subject: Revert "blk-cgroup: Flush stats at blkgs destruction path" This reverts commit dae590a6c96c799434e0ff8156ef29b88c257e60. We've had a few reports on this causing a crash at boot time, because of a reference issue. While this problem seemginly did exist before the patch and needs solving separately, this patch makes it a lot easier to trigger. Link: https://lore.kernel.org/linux-block/CA+QYu4oxiRKC6hJ7F27whXy-PRBx=Tvb+-7TQTONN8qTtV3aDA@mail.gmail.com/ Link: https://lore.kernel.org/linux-block/69af7ccb-6901-c84c-0e95-5682ccfb750c@acm.org/ Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6c4e66b3fa84..528bd44b59e2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -766,7 +766,6 @@ void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); -void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu); /* * Basic resource stats. -- cgit From aba3caef58626f09b629085440eec5dd1368669a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:22 +0000 Subject: KVM: Shorten gfn_to_pfn_cache function names Formalize "gpc" as the acronym and use it in function names. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b8d12356f015..8f874a964313 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1288,16 +1288,15 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc); * -EFAULT for an untranslatable guest physical address. * * This primes a gfn_to_pfn_cache and links it into the @kvm's list for - * invalidations to be processed. Callers are required to use - * kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before - * accessing the target page. + * invalidations to be processed. Callers are required to use kvm_gpc_check() + * to ensure that the cache is valid before accessing the target page. */ int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, gpa_t gpa, unsigned long len); /** - * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache. + * kvm_gpc_check - check validity of a gfn_to_pfn_cache. * * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. @@ -1314,11 +1313,11 @@ int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, * Callers in IN_GUEST_MODE may do so without locking, although they should * still hold a read lock on kvm->scru for the memslot checks. */ -bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - gpa_t gpa, unsigned long len); +bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, + unsigned long len); /** - * kvm_gfn_to_pfn_cache_refresh - update a previously initialized cache. + * kvm_gpc_refresh - update a previously initialized cache. * * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. @@ -1335,11 +1334,11 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, * still lock and check the cache status, as this function does not return * with the lock still held to permit access. */ -int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - gpa_t gpa, unsigned long len); +int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, + unsigned long len); /** - * kvm_gfn_to_pfn_cache_unmap - temporarily unmap a gfn_to_pfn_cache. + * kvm_gpc_unmap - temporarily unmap a gfn_to_pfn_cache. * * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. @@ -1348,7 +1347,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, * but at least the mapping from GPA to userspace HVA will remain cached * and can be reused on a subsequent refresh. */ -void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); +void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); /** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. -- cgit From 7500194a630b11236761df35fef300009d7d3f6f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Nov 2022 15:21:59 +0000 Subject: io_uring: reshuffle issue_flags Reshuffle issue flags to keep normal flags separate from the uring_cmd ctx-setup like flags. Shift the second type to the second byte so it's easier to add new ones in the future. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d6e4696c883943082d248716f4cd568f37b17a74.1669821213.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 0ded9e271523..29e519752da4 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -9,16 +9,15 @@ enum io_uring_cmd_flags { IO_URING_F_COMPLETE_DEFER = 1, IO_URING_F_UNLOCKED = 2, + /* the request is executed from poll, it should not be freed */ + IO_URING_F_MULTISHOT = 4, /* int's last bit, sign checks are usually faster than a bit test */ IO_URING_F_NONBLOCK = INT_MIN, /* ctx state flags, for URING_CMD */ - IO_URING_F_SQE128 = 4, - IO_URING_F_CQE32 = 8, - IO_URING_F_IOPOLL = 16, - - /* the request is executed from poll, it should not be freed */ - IO_URING_F_MULTISHOT = 32, + IO_URING_F_SQE128 = (1 << 8), + IO_URING_F_CQE32 = (1 << 9), + IO_URING_F_IOPOLL = (1 << 10), }; struct io_uring_cmd { -- cgit From e9374524950512a1769f610a868fcdf89ea59b8e Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Tue, 22 Nov 2022 20:30:57 +0100 Subject: netfilter: ipset: Add support for new bitmask parameter Add a new parameter to complement the existing 'netmask' option. The main difference between netmask and bitmask is that bitmask takes any arbitrary ip address as input, it does not have to be a valid netmask. The name of the new parameter is 'bitmask'. This lets us mask out arbitrary bits in the ip address, for example: ipset create set1 hash:ip bitmask 255.128.255.0 ipset create set2 hash:ip,port family inet6 bitmask ffff::ff80 Signed-off-by: Vishwanath Pai Signed-off-by: Joshua Hunt Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ada1296c87d5..ab934ad951a8 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -515,6 +515,16 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, *skbinfo = ext->skbinfo; } +static inline void +nf_inet_addr_mask_inplace(union nf_inet_addr *a1, + const union nf_inet_addr *mask) +{ + a1->all[0] &= mask->all[0]; + a1->all[1] &= mask->all[1]; + a1->all[2] &= mask->all[2]; + a1->all[3] &= mask->all[3]; +} + #define IP_SET_INIT_KEXT(skb, opt, set) \ { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } -- cgit From 2bd85221a625b316114bafaab527770b607095d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:26:36 +0100 Subject: block: untangle request_queue refcounting from sysfs The kobject embedded into the request_queue is used for the queue directory in sysfs, but that is a child of the gendisks directory and is intimately tied to it. Move this kobject to the gendisk and use a refcount_t in the request_queue for the actual request_queue refcounting that is completely unrelated to the device model. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221114042637.1009333-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 516e45246868..469299ea0660 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -155,6 +155,7 @@ struct gendisk { unsigned open_partitions; /* number of open partitions */ struct backing_dev_info *bdi; + struct kobject queue_kobj; /* the queue/ directory */ struct kobject *slave_dir; #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head slave_bdevs; @@ -430,10 +431,7 @@ struct request_queue { struct gendisk *disk; - /* - * queue kobject - */ - struct kobject kobj; + refcount_t refs; /* * mq queue kobject -- cgit From 8c82a0b3ba1a411b84af5d43a4cc5994efa897ec Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:24 +0000 Subject: KVM: Store immutable gfn_to_pfn_cache properties Move the assignment of immutable properties @kvm, @vcpu, and @usage to the initializer. Make _activate() and _deactivate() use stored values. Note, @len is also effectively immutable for most cases, but not in the case of the Xen runstate cache, which may be split across two pages and the length of the first segment will depend on its address. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj [sean: handle @len in a separate patch] Signed-off-by: Sean Christopherson [dwmw2: acknowledge that @len can actually change for some use cases] Signed-off-by: David Woodhouse --- include/linux/kvm_host.h | 37 ++++++++++++++++++------------------- include/linux/kvm_types.h | 1 + 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8f874a964313..73ded328f9dc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1260,18 +1260,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); * kvm_gpc_init - initialize gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. - * - * This sets up a gfn_to_pfn_cache by initializing locks. Note, the cache must - * be zero-allocated (or zeroed by the caller before init). - */ -void kvm_gpc_init(struct gfn_to_pfn_cache *gpc); - -/** - * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest - * physical address. - * * @kvm: pointer to kvm instance. - * @gpc: struct gfn_to_pfn_cache object. * @vcpu: vCPU to be used for marking pages dirty and to be woken on * invalidation. * @usage: indicates if the resulting host physical PFN is used while @@ -1280,20 +1269,31 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc); * changes!---will also force @vcpu to exit the guest and * refresh the cache); and/or if the PFN used directly * by KVM (and thus needs a kernel virtual mapping). + * + * This sets up a gfn_to_pfn_cache by initializing locks and assigning the + * immutable attributes. Note, the cache must be zero-allocated (or zeroed by + * the caller before init). + */ +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, + struct kvm_vcpu *vcpu, enum pfn_cache_usage usage); + +/** + * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest + * physical address. + * + * @gpc: struct gfn_to_pfn_cache object. * @gpa: guest physical address to map. * @len: sanity check; the range being access must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. - * -EFAULT for an untranslatable guest physical address. + * -EFAULT for an untranslatable guest physical address. * - * This primes a gfn_to_pfn_cache and links it into the @kvm's list for + * This primes a gfn_to_pfn_cache and links it into the @gpc->kvm's list for * invalidations to be processed. Callers are required to use kvm_gpc_check() * to ensure that the cache is valid before accessing the target page. */ -int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, - gpa_t gpa, unsigned long len); +int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); /** * kvm_gpc_check - check validity of a gfn_to_pfn_cache. @@ -1352,13 +1352,12 @@ void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); /** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. * - * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. * - * This removes a cache from the @kvm's list to be processed on MMU notifier + * This removes a cache from the VM's list to be processed on MMU notifier * invocation. */ -void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); +void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc); void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 3ca3db020e0e..76de36e56cdf 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -67,6 +67,7 @@ struct gfn_to_pfn_cache { gpa_t gpa; unsigned long uhva; struct kvm_memory_slot *memslot; + struct kvm *kvm; struct kvm_vcpu *vcpu; struct list_head list; rwlock_t lock; -- cgit From e308c24a358d1e79951b16c387cbc6c6593639a5 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:26 +0000 Subject: KVM: Use gfn_to_pfn_cache's immutable "kvm" in kvm_gpc_check() Make kvm_gpc_check() use kvm instance cached in gfn_to_pfn_cache. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- include/linux/kvm_host.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 73ded328f9dc..befc8114ed0d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1298,7 +1298,6 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) /** * kvm_gpc_check - check validity of a gfn_to_pfn_cache. * - * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. * @gpa: current guest physical address to map. * @len: sanity check; the range being access must fit a single page. @@ -1313,8 +1312,7 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) * Callers in IN_GUEST_MODE may do so without locking, although they should * still hold a read lock on kvm->scru for the memslot checks. */ -bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, - unsigned long len); +bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); /** * kvm_gpc_refresh - update a previously initialized cache. -- cgit From 0318f207d1c2e297d1ec1c6e145bb8bd053236f9 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:28 +0000 Subject: KVM: Use gfn_to_pfn_cache's immutable "kvm" in kvm_gpc_refresh() Make kvm_gpc_refresh() use kvm instance cached in gfn_to_pfn_cache. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj [sean: leave kvm_gpc_unmap() as-is] Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- include/linux/kvm_host.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index befc8114ed0d..3ce4650776b8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1317,23 +1317,21 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); /** * kvm_gpc_refresh - update a previously initialized cache. * - * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. * @gpa: updated guest physical address to map. * @len: sanity check; the range being access must fit a single page. - * + * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. - * -EFAULT for an untranslatable guest physical address. + * -EFAULT for an untranslatable guest physical address. * * This will attempt to refresh a gfn_to_pfn_cache. Note that a successful - * returm from this function does not mean the page can be immediately + * return from this function does not mean the page can be immediately * accessed because it may have raced with an invalidation. Callers must * still lock and check the cache status, as this function does not return * with the lock still held to permit access. */ -int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, - unsigned long len); +int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); /** * kvm_gpc_unmap - temporarily unmap a gfn_to_pfn_cache. -- cgit From 9f87791d686d85614584438d4f249eb32ef7964c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Oct 2022 21:12:29 +0000 Subject: KVM: Drop KVM's API to allow temporarily unmapping gfn=>pfn cache Drop kvm_gpc_unmap() as it has no users and unclear requirements. The API was added as part of the original gfn_to_pfn_cache support, but its sole usage[*] was never merged. Fold the guts of kvm_gpc_unmap() into the deactivate path and drop the API. Omit acquiring refresh_lock as as concurrent calls to kvm_gpc_deactivate() are not allowed (this is not enforced, e.g. via lockdep. due to it being called during vCPU destruction). If/when temporary unmapping makes a comeback, the desirable behavior is likely to restrict temporary unmapping to vCPU-exclusive mappings and require the vcpu->mutex be held to serialize unmap. Use of the refresh_lock to protect unmapping was somewhat specuatively added by commit 93984f19e7bc ("KVM: Fully serialize gfn=>pfn cache refresh via mutex") to guard against concurrent unmaps, but the primary use case of the temporary unmap, nested virtualization[*], doesn't actually need or want concurrent unmaps. [*] https://lore.kernel.org/all/20211210163625.2886-7-dwmw2@infradead.org Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- include/linux/kvm_host.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3ce4650776b8..eac76965cf44 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1333,18 +1333,6 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); */ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); -/** - * kvm_gpc_unmap - temporarily unmap a gfn_to_pfn_cache. - * - * @kvm: pointer to kvm instance. - * @gpc: struct gfn_to_pfn_cache object. - * - * This unmaps the referenced page. The cache is left in the invalid state - * but at least the mapping from GPA to userspace HVA will remain cached - * and can be reused on a subsequent refresh. - */ -void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); - /** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. * -- cgit From 58f5ee5fedd981e05cb086cba4e8f923c3727a04 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Oct 2022 21:12:31 +0000 Subject: KVM: Drop @gpa from exported gfn=>pfn cache check() and refresh() helpers Drop the @gpa param from the exported check()+refresh() helpers and limit changing the cache's GPA to the activate path. All external users just feed in gpc->gpa, i.e. this is a fancy nop. Allowing users to change the GPA at check()+refresh() is dangerous as those helpers explicitly allow concurrent calls, e.g. KVM could get into a livelock scenario. It's also unclear as to what the expected behavior should be if multiple tasks attempt to refresh with different GPAs. Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- include/linux/kvm_host.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index eac76965cf44..7008846fd3dd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1299,7 +1299,6 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) * kvm_gpc_check - check validity of a gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. - * @gpa: current guest physical address to map. * @len: sanity check; the range being access must fit a single page. * * @return: %true if the cache is still valid and the address matches. @@ -1312,15 +1311,14 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) * Callers in IN_GUEST_MODE may do so without locking, although they should * still hold a read lock on kvm->scru for the memslot checks. */ -bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); +bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len); /** * kvm_gpc_refresh - update a previously initialized cache. * * @gpc: struct gfn_to_pfn_cache object. - * @gpa: updated guest physical address to map. * @len: sanity check; the range being access must fit a single page. - + * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. @@ -1331,7 +1329,7 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); * still lock and check the cache status, as this function does not return * with the lock still held to permit access. */ -int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); +int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len); /** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. -- cgit From dec1d352de5c6e2bcdbb03a2e7a84d85ad2e4f14 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 8 Nov 2022 10:43:57 -0800 Subject: mm: replace VM_WARN_ON to pr_warn if the node is offline with __GFP_THISNODE Syzbot reported the below splat: WARNING: CPU: 1 PID: 3646 at include/linux/gfp.h:221 __alloc_pages_node include/linux/gfp.h:221 [inline] WARNING: CPU: 1 PID: 3646 at include/linux/gfp.h:221 hpage_collapse_alloc_page mm/khugepaged.c:807 [inline] WARNING: CPU: 1 PID: 3646 at include/linux/gfp.h:221 alloc_charge_hpage+0x802/0xaa0 mm/khugepaged.c:963 Modules linked in: CPU: 1 PID: 3646 Comm: syz-executor210 Not tainted 6.1.0-rc1-syzkaller-00454-ga70385240892 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/11/2022 RIP: 0010:__alloc_pages_node include/linux/gfp.h:221 [inline] RIP: 0010:hpage_collapse_alloc_page mm/khugepaged.c:807 [inline] RIP: 0010:alloc_charge_hpage+0x802/0xaa0 mm/khugepaged.c:963 Code: e5 01 4c 89 ee e8 6e f9 ae ff 4d 85 ed 0f 84 28 fc ff ff e8 70 fc ae ff 48 8d 6b ff 4c 8d 63 07 e9 16 fc ff ff e8 5e fc ae ff <0f> 0b e9 96 fa ff ff 41 bc 1a 00 00 00 e9 86 fd ff ff e8 47 fc ae RSP: 0018:ffffc90003fdf7d8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888077f457c0 RSI: ffffffff81cd8f42 RDI: 0000000000000001 RBP: ffff888079388c0c R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: dffffc0000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f6b48ccf700(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f6b48a819f0 CR3: 00000000171e7000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: collapse_file+0x1ca/0x5780 mm/khugepaged.c:1715 hpage_collapse_scan_file+0xd6c/0x17a0 mm/khugepaged.c:2156 madvise_collapse+0x53a/0xb40 mm/khugepaged.c:2611 madvise_vma_behavior+0xd0a/0x1cc0 mm/madvise.c:1066 madvise_walk_vmas+0x1c7/0x2b0 mm/madvise.c:1240 do_madvise.part.0+0x24a/0x340 mm/madvise.c:1419 do_madvise mm/madvise.c:1432 [inline] __do_sys_madvise mm/madvise.c:1432 [inline] __se_sys_madvise mm/madvise.c:1430 [inline] __x64_sys_madvise+0x113/0x150 mm/madvise.c:1430 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f6b48a4eef9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 b1 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f6b48ccf318 EFLAGS: 00000246 ORIG_RAX: 000000000000001c RAX: ffffffffffffffda RBX: 00007f6b48af0048 RCX: 00007f6b48a4eef9 RDX: 0000000000000019 RSI: 0000000000600003 RDI: 0000000020000000 RBP: 00007f6b48af0040 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f6b48aa53a4 R13: 00007f6b48bffcbf R14: 00007f6b48ccf400 R15: 0000000000022000 It is because khugepaged allocates pages with __GFP_THISNODE, but the preferred node is bogus. The previous patch fixed the khugepaged code to avoid allocating page from non-existing node. But it is still racy against memory hotremove. There is no synchronization with the memory hotplug so it is possible that memory gets offline during a longer taking scanning. So this warning still seems not quite helpful because: * There is no guarantee the node is online for __GFP_THISNODE context for all the callsites. * Kernel just fails the allocation regardless the warning, and it looks all callsites handle the allocation failure gracefully. Although while the warning has helped to identify a buggy code, it is not safe in general and this warning could panic the system with panic-on-warn configuration which tends to be used surprisingly often. So replace VM_WARN_ON to pr_warn(). And the warning will be triggered if __GFP_NOWARN is set since the allocator would print out warning for such case if __GFP_NOWARN is not set. [shy828301@gmail.com: rename nid to this_node and gfp to warn_gfp] Link: https://lkml.kernel.org/r/20221123193014.153983-1-shy828301@gmail.com [akpm@linux-foundation.org: fix whitespace] [akpm@linux-foundation.org: print gfp_mask instead of warn_gfp, per Michel] Link: https://lkml.kernel.org/r/20221108184357.55614-3-shy828301@gmail.com Fixes: 7d8faaf15545 ("mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse") Signed-off-by: Yang Shi Reported-by: Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/gfp.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ef4aea3b356e..65a78773dcca 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -210,6 +210,20 @@ alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct p return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array); } +static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) +{ + gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); + + if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN)) + return; + + if (node_online(this_node)) + return; + + pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node); + dump_stack(); +} + /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). @@ -218,7 +232,7 @@ static inline struct page * __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); - VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid)); + warn_if_node_offline(nid, gfp_mask); return __alloc_pages(gfp_mask, order, nid, NULL); } @@ -227,7 +241,7 @@ static inline struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); - VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid)); + warn_if_node_offline(nid, gfp); return __folio_alloc(gfp, order, nid, NULL); } -- cgit From 21b85b09527c28e242db55c1b751f7f7549b830c Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 14 Nov 2022 15:55:05 -0800 Subject: madvise: use zap_page_range_single for madvise dontneed This series addresses the issue first reported in [1], and fully described in patch 2. Patches 1 and 2 address the user visible issue and are tagged for stable backports. While exploring solutions to this issue, related problems with mmu notification calls were discovered. This is addressed in the patch "hugetlb: remove duplicate mmu notifications:". Since there are no user visible effects, this third is not tagged for stable backports. Previous discussions suggested further cleanup by removing the routine zap_page_range. This is possible because zap_page_range_single is now exported, and all callers of zap_page_range pass ranges entirely within a single vma. This work will be done in a later patch so as not to distract from this bug fix. [1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6JudD0g@mail.gmail.com/ This patch (of 2): Expose the routine zap_page_range_single to zap a range within a single vma. The madvise routine madvise_dontneed_single_vma can use this routine as it explicitly operates on a single vma. Also, update the mmu notification range in zap_page_range_single to take hugetlb pmd sharing into account. This is required as MADV_DONTNEED supports hugetlb vmas. Link: https://lkml.kernel.org/r/20221114235507.294320-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20221114235507.294320-2-mike.kravetz@oracle.com Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings") Signed-off-by: Mike Kravetz Reported-by: Wei Chen Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Mina Almasry Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Peter Xu Cc: Rik van Riel Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bbcccbc5565..cbfb489d381c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1852,6 +1852,23 @@ static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodem __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1); } +/* + * Parameter block passed down to zap_pte_range in exceptional cases. + */ +struct zap_details { + struct folio *single_folio; /* Locked folio to be unmapped */ + bool even_cows; /* Zap COWed private pages too? */ + zap_flags_t zap_flags; /* Extra flags for zapping */ +}; + +/* + * Whether to drop the pte markers, for example, the uffd-wp information for + * file-backed memory. This should only be specified when we will completely + * drop the page in the mm, either by truncation or unmapping of the vma. By + * default, the flag is not set. + */ +#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) + #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else @@ -1869,6 +1886,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details); void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); @@ -3467,12 +3486,4 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } #endif -/* - * Whether to drop the pte markers, for example, the uffd-wp information for - * file-backed memory. This should only be specified when we will completely - * drop the page in the mm, either by truncation or unmapping of the vma. By - * default, the flag is not set. - */ -#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) - #endif /* _LINUX_MM_H */ -- cgit From 04ada095dcfc4ae359418053c0be94453bdf1e84 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 14 Nov 2022 15:55:06 -0800 Subject: hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page tables associated with the address range. For hugetlb vmas, zap_page_range will call __unmap_hugepage_range_final. However, __unmap_hugepage_range_final assumes the passed vma is about to be removed and deletes the vma_lock to prevent pmd sharing as the vma is on the way out. In the case of madvise(MADV_DONTNEED) the vma remains, but the missing vma_lock prevents pmd sharing and could potentially lead to issues with truncation/fault races. This issue was originally reported here [1] as a BUG triggered in page_try_dup_anon_rmap. Prior to the introduction of the hugetlb vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to prevent pmd sharing. Subsequent faults on this vma were confused as VM_MAYSHARE indicates a sharable vma, but was not set so page_mapping was not set in new pages added to the page table. This resulted in pages that appeared anonymous in a VM_SHARED vma and triggered the BUG. Address issue by adding a new zap flag ZAP_FLAG_UNMAP to indicate an unmap call from unmap_vmas(). This is used to indicate the 'final' unmapping of a hugetlb vma. When called via MADV_DONTNEED, this flag is not set and the vm_lock is not deleted. [1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6JudD0g@mail.gmail.com/ Link: https://lkml.kernel.org/r/20221114235507.294320-3-mike.kravetz@oracle.com Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings") Signed-off-by: Mike Kravetz Reported-by: Wei Chen Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Mina Almasry Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Peter Xu Cc: Rik van Riel Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cbfb489d381c..974ccca609d2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1868,6 +1868,8 @@ struct zap_details { * default, the flag is not set. */ #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) +/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ +#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) #ifdef CONFIG_MMU extern bool can_do_mlock(void); -- cgit From 6617da8fb565445e0be4a4885443006374943d09 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 30 Nov 2022 14:49:41 -0800 Subject: mm: add dummy pmd_young() for architectures not having it In order to avoid #ifdeffery add a dummy pmd_young() implementation as a fallback. This is required for the later patch "mm: introduce arch_has_hw_nonleaf_pmd_young()". Link: https://lkml.kernel.org/r/fd3ac3cd-7349-6bbd-890a-71a9454ca0b3@suse.com Signed-off-by: Juergen Gross Acked-by: Yu Zhao Cc: Borislav Petkov Cc: Dave Hansen Cc: Geert Uytterhoeven Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Sander Eikelenboom Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a108b60a6962..6b0d59269b33 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -165,6 +165,13 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr) return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } +#ifndef pmd_young +static inline int pmd_young(pmd_t pmd) +{ + return 0; +} +#endif + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, -- cgit From 4aaf269c768dbacd6268af73fda2ffccaa3f1d88 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 23 Nov 2022 07:45:10 +0100 Subject: mm: introduce arch_has_hw_nonleaf_pmd_young() When running as a Xen PV guests commit eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation in pmdp_test_and_clear_young(): BUG: unable to handle page fault for address: ffff8880083374d0 #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065 Oops: 0003 [#1] PREEMPT SMP NOPTI CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1 RIP: e030:pmdp_test_and_clear_young+0x25/0x40 This happens because the Xen hypervisor can't emulate direct writes to page table entries other than PTEs. This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young() similar to arch_has_hw_pte_young() and test that instead of CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG. Link: https://lkml.kernel.org/r/20221123064510.16225-1-jgross@suse.com Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") Signed-off-by: Juergen Gross Reported-by: Sander Eikelenboom Acked-by: Yu Zhao Tested-by: Sander Eikelenboom Acked-by: David Hildenbrand [core changes] Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 6b0d59269b33..5f0d7d0b9471 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -267,6 +267,17 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef arch_has_hw_nonleaf_pmd_young +/* + * Return whether the accessed bit in non-leaf PMD entries is supported on the + * local CPU. + */ +static inline bool arch_has_hw_nonleaf_pmd_young(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); +} +#endif + #ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. -- cgit From 1d351f1894342c378b96bb9ed89f8debb1e24e9f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 28 Nov 2022 13:21:40 -0800 Subject: revert "kbuild: fix -Wimplicit-function-declaration in license_is_gpl_compatible" It causes build failures with unusual CC/HOSTCC combinations. Quoting https://lkml.kernel.org/r/A222B1E6-69B8-4085-AD1B-27BDB72CA971@goldelico.com: HOSTCC scripts/mod/modpost.o - due to target missing In file included from include/linux/string.h:5, from scripts/mod/../../include/linux/license.h:5, from scripts/mod/modpost.c:24: include/linux/compiler.h:246:10: fatal error: asm/rwonce.h: No such file or directory 246 | #include | ^~~~~~~~~~~~~~ compilation terminated. ... The problem is that HOSTCC is not necessarily the same compiler or even architecture as CC and pulling in or files indirectly isn't a good idea then. My toolchain is providing HOSTCC = gcc (MacPorts) and CC = arm-linux-gnueabihf (built from gcc source) and all running on Darwin. If I change the include to I can then "HOSTCC scripts/mod/modpost.c" but then it fails for "CC kernel/module/main.c" not finding : CC kernel/module/main.o - due to target missing In file included from kernel/module/main.c:43:0: ./include/linux/license.h:5:20: fatal error: string.h: No such file or directory #include ^ compilation terminated. Reported-by: "H. Nikolaus Schaller" Cc: Sam James Signed-off-by: Andrew Morton --- include/linux/license.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/license.h b/include/linux/license.h index ad937f57f2cb..7cce390f120b 100644 --- a/include/linux/license.h +++ b/include/linux/license.h @@ -2,8 +2,6 @@ #ifndef __LICENSE_H #define __LICENSE_H -#include - static inline int license_is_gpl_compatible(const char *license) { return (strcmp(license, "GPL") == 0 -- cgit From 0af8489b0216fa1dd83e264bef8063f2632633d7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Nov 2022 18:14:31 +0100 Subject: mm, slub: remove percpu slabs with CONFIG_SLUB_TINY SLUB gets most of its scalability by percpu slabs. However for CONFIG_SLUB_TINY the goal is minimal memory overhead, not scalability. Thus, #ifdef out the whole kmem_cache_cpu percpu structure and associated code. Additionally to the slab page savings, this reduces percpu allocator usage, and code size. This change builds on recent commit c7323a5ad078 ("mm/slub: restrict sysfs validation to debug caches and make it safe"), as caches with enabled debugging also avoid percpu slabs and all allocations and freeing ends up working with the partial list. With a bit more refactoring by the preceding patches, use the same code paths with CONFIG_SLUB_TINY. Signed-off-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- include/linux/slub_def.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 95b67863d5cf..aa0ee1678d29 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -41,6 +41,7 @@ enum stat_item { CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ NR_SLUB_STAT_ITEMS }; +#ifndef CONFIG_SLUB_TINY /* * When changing the layout, make sure freelist and tid are still compatible * with this_cpu_cmpxchg_double() alignment requirements. @@ -57,6 +58,7 @@ struct kmem_cache_cpu { unsigned stat[NR_SLUB_STAT_ITEMS]; #endif }; +#endif /* CONFIG_SLUB_TINY */ #ifdef CONFIG_SLUB_CPU_PARTIAL #define slub_percpu_partial(c) ((c)->partial) @@ -88,7 +90,9 @@ struct kmem_cache_order_objects { * Slab cache management. */ struct kmem_cache { +#ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; +#endif /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; -- cgit From c67cae551f0df80421b5703ee56ff5e2fe9c4de6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 25 Nov 2022 14:06:17 -0800 Subject: bpf: Tighten ptr_to_btf_id checks. The networking programs typically don't require CAP_PERFMON, but through kfuncs like bpf_cast_to_kern_ctx() they can access memory through PTR_TO_BTF_ID. In such case enforce CAP_PERFMON. Also make sure that only GPL programs can access kernel data structures. All kfuncs require GPL already. Also remove allow_ptr_to_map_access. It's the same as allow_ptr_leaks and different name for the same check only causes confusion. Fixes: fd264ca02094 ("bpf: Add a kfunc to type cast from bpf uapi ctx to kernel ctx") Fixes: 50c6b8a9aea2 ("selftests/bpf: Add a test for btf_type_tag "percpu"") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221125220617.26846-1-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 5 ----- include/linux/bpf_verifier.h | 1 - 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 67452103bb86..4920ac252754 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1909,11 +1909,6 @@ static inline bool bpf_allow_uninit_stack(void) return perfmon_capable(); } -static inline bool bpf_allow_ptr_to_map_access(void) -{ - return perfmon_capable(); -} - static inline bool bpf_bypass_spec_v1(void) { return perfmon_capable(); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c05aa6e1f6f5..b5090e89cb3f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -531,7 +531,6 @@ struct bpf_verifier_env { bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; - bool allow_ptr_to_map_access; bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; -- cgit From f1a7941243c102a44e8847e3b94ff4ff3ec56f25 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 24 Oct 2022 05:28:41 +0000 Subject: mm: convert mm's rss stats into percpu_counter Currently mm_struct maintains rss_stats which are updated on page fault and the unmapping codepaths. For page fault codepath the updates are cached per thread with the batch of TASK_RSS_EVENTS_THRESH which is 64. The reason for caching is performance for multithreaded applications otherwise the rss_stats updates may become hotspot for such applications. However this optimization comes with the cost of error margin in the rss stats. The rss_stats for applications with large number of threads can be very skewed. At worst the error margin is (nr_threads * 64) and we have a lot of applications with 100s of threads, so the error margin can be very high. Internally we had to reduce TASK_RSS_EVENTS_THRESH to 32. Recently we started seeing the unbounded errors for rss_stats for specific applications which use TCP rx0cp. It seems like vm_insert_pages() codepath does not sync rss_stats at all. This patch converts the rss_stats into percpu_counter to convert the error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2). However this conversion enable us to get the accurate stats for situations where accuracy is more important than the cpu cost. This patch does not make such tradeoffs - we can just use percpu_counter_add_local() for the updates and percpu_counter_sum() (or percpu_counter_sync() + percpu_counter_read) for the readers. At the moment the readers are either procfs interface, oom_killer and memory reclaim which I think are not performance critical and should be ok with slow read. However I think we can make that change in a separate patch. Link: https://lkml.kernel.org/r/20221024052841.3291983-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Marek Szyprowski Signed-off-by: Andrew Morton --- include/linux/mm.h | 26 ++++++++------------------ include/linux/mm_types.h | 7 ++----- include/linux/mm_types_task.h | 13 ------------- include/linux/percpu_counter.h | 1 - include/linux/sched.h | 3 --- 5 files changed, 10 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f919befc8fac..0cb4e196d60b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2052,40 +2052,30 @@ static inline bool get_user_page_fast_only(unsigned long addr, */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { - long val = atomic_long_read(&mm->rss_stat.count[member]); - -#ifdef SPLIT_RSS_COUNTING - /* - * counter is updated in asynchronous manner and may go to minus. - * But it's never be expected number for users. - */ - if (val < 0) - val = 0; -#endif - return (unsigned long)val; + return percpu_counter_read_positive(&mm->rss_stat[member]); } -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count); +void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); + percpu_counter_add(&mm->rss_stat[member], value); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_inc_return(&mm->rss_stat.count[member]); + percpu_counter_inc(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_dec_return(&mm->rss_stat.count[member]); + percpu_counter_dec(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } /* Optimized variant when page is already known not to be PageAnon */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2d5b1575ffe0..e86861ff5bbd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -626,11 +627,7 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - /* - * Special counters, in some configurations protected by the - * page_table_lock, in other configurations by being atomic. - */ - struct mm_rss_stat rss_stat; + struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 0bb4b6da9993..5414b5c6a103 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -36,19 +36,6 @@ enum { NR_MM_COUNTERS }; -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU) -#define SPLIT_RSS_COUNTING -/* per-thread cached information, */ -struct task_rss_stat { - int events; /* for synchronization threshold */ - int count[NR_MM_COUNTERS]; -}; -#endif /* USE_SPLIT_PTE_PTLOCKS */ - -struct mm_rss_stat { - atomic_long_t count[NR_MM_COUNTERS]; -}; - struct page_frag { struct page *page; #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 8ed5fba6d156..bde6c4c1f405 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -13,7 +13,6 @@ #include #include #include -#include /* percpu_counter batch for local add or sub */ #define PERCPU_COUNTER_LOCAL_BATCH INT_MAX diff --git a/include/linux/sched.h b/include/linux/sched.h index ffb6eb55cd13..079d299fa465 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -870,9 +870,6 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; -#ifdef SPLIT_RSS_COUNTING - struct task_rss_stat rss_stat; -#endif int exit_state; int exit_code; int exit_signal; -- cgit From f689054aace2ff13af2e9a44a74fbba650ca31ba Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 9 Nov 2022 01:20:11 +0000 Subject: percpu_counter: add percpu_counter_sum_all interface The percpu_counter is used for scenarios where performance is more important than the accuracy. For percpu_counter users, who want more accurate information in their slowpath, percpu_counter_sum is provided which traverses all the online CPUs to accumulate the data. The reason it only needs to traverse online CPUs is because percpu_counter does implement CPU offline callback which syncs the local data of the offlined CPU. However there is a small race window between the online CPUs traversal of percpu_counter_sum and the CPU offline callback. The offline callback has to traverse all the percpu_counters on the system to flush the CPU local data which can be a lot. During that time, the CPU which is going offline has already been published as offline to all the readers. So, as the offline callback is running, percpu_counter_sum can be called for one counter which has some state on the CPU going offline. Since percpu_counter_sum only traverses online CPUs, it will skip that specific CPU and the offline callback might not have flushed the state for that specific percpu_counter on that offlined CPU. Normally this is not an issue because percpu_counter users can deal with some inaccuracy for small time window. However a new user i.e. mm_struct on the cleanup path wants to check the exact state of the percpu_counter through check_mm(). For such users, this patch introduces percpu_counter_sum_all() which traverses all possible CPUs and it is used in fork.c:check_mm() to avoid the potential race. This issue is exposed by the later patch "mm: convert mm's rss stats into percpu_counter". Link: https://lkml.kernel.org/r/20221109012011.881058-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Andrew Morton --- include/linux/percpu_counter.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index bde6c4c1f405..a3aae8d57a42 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -45,6 +45,7 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); +s64 percpu_counter_sum_all(struct percpu_counter *fbc); int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); void percpu_counter_sync(struct percpu_counter *fbc); @@ -193,6 +194,11 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc) return percpu_counter_read(fbc); } +static inline s64 percpu_counter_sum_all(struct percpu_counter *fbc) +{ + return percpu_counter_read(fbc); +} + static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return true; -- cgit From a873dfe1032a132bf89f9e19a6ac44f5a0b78754 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 21 Oct 2022 13:01:19 -0700 Subject: mm, hwpoison: try to recover from copy-on write faults Patch series "Copy-on-write poison recovery", v3. Part 1 deals with the process that triggered the copy on write fault with a store to a shared read-only page. That process is send a SIGBUS with the usual machine check decoration to specify the virtual address of the lost page, together with the scope. Part 2 sets up to asynchronously take the page with the uncorrected error offline to prevent additional machine check faults. H/t to Miaohe Lin and Shuai Xue for pointing me to the existing function to queue a call to memory_failure(). On x86 there is some duplicate reporting (because the error is also signalled by the memory controller as well as by the core that triggered the machine check). Console logs look like this: This patch (of 2): If the kernel is copying a page as the result of a copy-on-write fault and runs into an uncorrectable error, Linux will crash because it does not have recovery code for this case where poison is consumed by the kernel. It is easy to set up a test case. Just inject an error into a private page, fork(2), and have the child process write to the page. I wrapped that neatly into a test at: git://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git just enable ACPI error injection and run: # ./einj_mem-uc -f copy-on-write Add a new copy_user_highpage_mc() function that uses copy_mc_to_kernel() on architectures where that is available (currently x86 and powerpc). When an error is detected during the page copy, return VM_FAULT_HWPOISON to caller of wp_page_copy(). This propagates up the call stack. Both x86 and powerpc have code in their fault handler to deal with this code by sending a SIGBUS to the application. Note that this patch avoids a system crash and signals the process that triggered the copy-on-write action. It does not take any action for the memory error that is still in the shared page. To handle that a call to memory_failure() is needed. But this cannot be done from wp_page_copy() because it holds mmap_lock(). Perhaps the architecture fault handlers can deal with this loose end in a subsequent patch? On Intel/x86 this loose end will often be handled automatically because the memory controller provides an additional notification of the h/w poison in memory, the handler for this will call memory_failure(). This isn't a 100% solution. If there are multiple errors, not all may be logged in this way. [tony.luck@intel.com: add call to kmsan_unpoison_memory(), per Miaohe Lin] Link: https://lkml.kernel.org/r/20221031201029.102123-2-tony.luck@intel.com Link: https://lkml.kernel.org/r/20221021200120.175753-1-tony.luck@intel.com Link: https://lkml.kernel.org/r/20221021200120.175753-2-tony.luck@intel.com Signed-off-by: Tony Luck Reviewed-by: Dan Williams Reviewed-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Reviewed-by: Alexander Potapenko Tested-by: Shuai Xue Cc: Christophe Leroy Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/highmem.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index e9912da5441b..44242268f53b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -319,6 +319,32 @@ static inline void copy_user_highpage(struct page *to, struct page *from, #endif +#ifdef copy_mc_to_kernel +static inline int copy_mc_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + unsigned long ret; + char *vfrom, *vto; + + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); + ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); + if (!ret) + kmsan_unpoison_memory(page_address(to), PAGE_SIZE); + kunmap_local(vto); + kunmap_local(vfrom); + + return ret; +} +#else +static inline int copy_mc_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + copy_user_highpage(to, from, vaddr, vma); + return 0; +} +#endif + #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) -- cgit From d302c2398ba269e788a4f37ae57c07a7fcabaa42 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 21 Oct 2022 13:01:20 -0700 Subject: mm, hwpoison: when copy-on-write hits poison, take page offline Cannot call memory_failure() directly from the fault handler because mmap_lock (and others) are held. It is important, but not urgent, to mark the source page as h/w poisoned and unmap it from other tasks. Use memory_failure_queue() to request a call to memory_failure() for the page with the error. Also provide a stub version for CONFIG_MEMORY_FAILURE=n Link: https://lkml.kernel.org/r/20221021200120.175753-3-tony.luck@intel.com Signed-off-by: Tony Luck Reviewed-by: Miaohe Lin Cc: Christophe Leroy Cc: Dan Williams Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Naoya Horiguchi Cc: Nicholas Piggin Cc: Shuai Xue Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0cb4e196d60b..3950ef45b9a9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3280,7 +3280,6 @@ enum mf_flags { int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); -extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; @@ -3289,11 +3288,16 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE +extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); #else +static inline void memory_failure_queue(unsigned long pfn, int flags) +{ +} + static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { -- cgit From 57e9cc50f4dd926d6c38751799d25cad89fb2bd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 26 Oct 2022 14:01:33 -0400 Subject: mm: vmscan: split khugepaged stats from direct reclaim stats Direct reclaim stats are useful for identifying a potential source for application latency, as well as spotting issues with kswapd. However, khugepaged currently distorts the picture: as a kernel thread it doesn't impose allocation latencies on userspace, and it explicitly opts out of kswapd reclaim. Its activity showing up in the direct reclaim stats is misleading. Counting it as kswapd reclaim could also cause confusion when trying to understand actual kswapd behavior. Break out khugepaged from the direct reclaim counters into new pgsteal_khugepaged, pgdemote_khugepaged, pgscan_khugepaged counters. Test with a huge executable (CONFIG_READ_ONLY_THP_FOR_FS): pgsteal_kswapd 1342185 pgsteal_direct 0 pgsteal_khugepaged 3623 pgscan_kswapd 1345025 pgscan_direct 0 pgscan_khugepaged 3623 Link: https://lkml.kernel.org/r/20221026180133.377671-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Eric Bergen Cc: Matthew Wilcox (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 6 ++++++ include/linux/vm_event_item.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 70162d707caf..f68865e19b0b 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,7 @@ extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); +extern bool current_is_khugepaged(void); #ifdef CONFIG_SHMEM extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd); @@ -57,6 +58,11 @@ static inline int collapse_pte_mapped_thp(struct mm_struct *mm, static inline void khugepaged_min_free_kbytes_update(void) { } + +static inline bool current_is_khugepaged(void) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3518dba1e02f..7f5d1caf5890 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -40,10 +40,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, PGSCAN_KSWAPD, PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, PGSCAN_DIRECT_THROTTLE, PGSCAN_ANON, PGSCAN_FILE, -- cgit From a098c977722ca27d3b4bfeb966767af3cce45f85 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:51 -0700 Subject: mm/hugetlb_cgroup: convert __set_hugetlb_cgroup() to folios Patch series "convert hugetlb_cgroup helper functions to folios", v2. This patch series continues the conversion of hugetlb code from being managed in pages to folios by converting many of the hugetlb_cgroup helper functions to use folios. This allows the core hugetlb functions to pass in a folio to these helper functions. This patch (of 9); Change __set_hugetlb_cgroup() to use folios so it is explicit that the function operates on a head page. Link: https://lkml.kernel.org/r/20221101223059.460937-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20221101223059.460937-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 630cd255d0cf..7576e9ed8afe 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -90,31 +90,31 @@ hugetlb_cgroup_from_page_rsvd(struct page *page) return __hugetlb_cgroup_from_page(page, true); } -static inline void __set_hugetlb_cgroup(struct page *page, +static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { - VM_BUG_ON_PAGE(!PageHuge(page), page); + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); - if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return; if (rsvd) - set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD, + set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD), (unsigned long)h_cg); else - set_page_private(page + SUBPAGE_INDEX_CGROUP, + set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP), (unsigned long)h_cg); } static inline void set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page, h_cg, false); + __set_hugetlb_cgroup(page_folio(page), h_cg, false); } static inline void set_hugetlb_cgroup_rsvd(struct page *page, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page, h_cg, true); + __set_hugetlb_cgroup(page_folio(page), h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) -- cgit From f074732d599e19a2a5b12e54743ad5eaccbe6550 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:52 -0700 Subject: mm/hugetlb_cgroup: convert hugetlb_cgroup_from_page() to folios Introduce folios in __remove_hugetlb_page() by converting hugetlb_cgroup_from_page() to use folios. Also gets rid of unsed hugetlb_cgroup_from_page_resv() function. Link: https://lkml.kernel.org/r/20221101223059.460937-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 7576e9ed8afe..feb2edafc8b6 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -67,27 +67,34 @@ struct hugetlb_cgroup { }; static inline struct hugetlb_cgroup * -__hugetlb_cgroup_from_page(struct page *page, bool rsvd) +__hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { - VM_BUG_ON_PAGE(!PageHuge(page), page); + struct page *tail; - if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); + if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return NULL; - if (rsvd) - return (void *)page_private(page + SUBPAGE_INDEX_CGROUP_RSVD); - else - return (void *)page_private(page + SUBPAGE_INDEX_CGROUP); + + if (rsvd) { + tail = folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD); + return (void *)page_private(tail); + } + + else { + tail = folio_page(folio, SUBPAGE_INDEX_CGROUP); + return (void *)page_private(tail); + } } -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { - return __hugetlb_cgroup_from_page(page, false); + return __hugetlb_cgroup_from_folio(folio, false); } static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_rsvd(struct page *page) +hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { - return __hugetlb_cgroup_from_page(page, true); + return __hugetlb_cgroup_from_folio(folio, true); } static inline void __set_hugetlb_cgroup(struct folio *folio, @@ -181,19 +188,13 @@ static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, { } -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) -{ - return NULL; -} - -static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_resv(struct page *page) +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_rsvd(struct page *page) +hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { return NULL; } -- cgit From de656ed376c4cb47c5713fba52f8bbfbea44f387 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:53 -0700 Subject: mm/hugetlb_cgroup: convert set_hugetlb_cgroup*() to folios Allows __prep_new_huge_page() to operate on a folio by converting set_hugetlb_cgroup*() to take in a folio. Link: https://lkml.kernel.org/r/20221101223059.460937-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index feb2edafc8b6..a7e3540f7f38 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -112,16 +112,16 @@ static inline void __set_hugetlb_cgroup(struct folio *folio, (unsigned long)h_cg); } -static inline void set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page_folio(page), h_cg, false); + __set_hugetlb_cgroup(folio, h_cg, false); } -static inline void set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page_folio(page), h_cg, true); + __set_hugetlb_cgroup(folio, h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) @@ -199,12 +199,12 @@ hugetlb_cgroup_from_folio_rsvd(struct folio *folio) return NULL; } -static inline void set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { } -static inline void set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { } -- cgit From 29f394304f624b06fafb3cc9c3da8779f71f4bee Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:54 -0700 Subject: mm/hugetlb_cgroup: convert hugetlb_cgroup_migrate to folios Cleans up intermediate page to folio conversion code in hugetlb_cgroup_migrate() by changing its arguments from pages to folios. Link: https://lkml.kernel.org/r/20221101223059.460937-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index a7e3540f7f38..789b6fef176d 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -177,8 +177,8 @@ extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, bool region_del); extern void hugetlb_cgroup_file_init(void) __init; -extern void hugetlb_cgroup_migrate(struct page *oldhpage, - struct page *newhpage); +extern void hugetlb_cgroup_migrate(struct folio *old_folio, + struct folio *new_folio); #else static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, @@ -286,8 +286,8 @@ static inline void hugetlb_cgroup_file_init(void) { } -static inline void hugetlb_cgroup_migrate(struct page *oldhpage, - struct page *newhpage) +static inline void hugetlb_cgroup_migrate(struct folio *old_folio, + struct folio *new_folio) { } -- cgit From d4ab0316cc33aeedf6dcb1c2c25e097a25766132 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:57 -0700 Subject: mm/hugetlb_cgroup: convert hugetlb_cgroup_uncharge_page() to folios Continue to use a folio inside free_huge_page() by converting hugetlb_cgroup_uncharge_page*() to folios. Link: https://lkml.kernel.org/r/20221101223059.460937-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 789b6fef176d..c70f92fe493e 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -158,10 +158,10 @@ extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page); -extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page); -extern void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, - struct page *page); +extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio); +extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, + struct folio *folio); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); @@ -254,14 +254,14 @@ hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, { } -static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page) +static inline void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio) { } -static inline void hugetlb_cgroup_uncharge_page_rsvd(int idx, +static inline void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, - struct page *page) + struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_cgroup(int idx, -- cgit From 345c62d163496ae4b5c1ce530b1588067d8f5a8b Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:59 -0700 Subject: mm/hugetlb: convert move_hugetlb_state() to folios Clean up unmap_and_move_huge_page() by converting move_hugetlb_state() to take in folios. [akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n build] Link: https://lkml.kernel.org/r/20221101223059.460937-10-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 65ea34022aa2..58a30938a9b1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -187,7 +187,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void putback_active_hugepage(struct page *page); -void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); +void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -407,8 +407,8 @@ static inline void putback_active_hugepage(struct page *page) { } -static inline void move_hugetlb_state(struct page *oldpage, - struct page *newpage, int reason) +static inline void move_hugetlb_state(struct folio *old_folio, + struct folio *new_folio, int reason) { } @@ -991,6 +991,11 @@ void hugetlb_unregister_node(struct node *node); #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) +{ + return NULL; +} + static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { return NULL; -- cgit From 44467bbb7e81ebcef2a5bfc9d6546bf7cd015374 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:21 +0000 Subject: mm/damon/core: add a callback for scheme target regions check Patch series "efficiently expose damos action tried regions information". DAMON users can retrieve the monitoring results via 'after_aggregation' callbacks if the user is using the kernel API, or 'damon_aggregated' tracepoint if the user is in the user space. Those are useful if full monitoring results are necessary. However, if the user has interest in only a snapshot of the results for some regions having specific access pattern, the interfaces could be inefficient. For example, some users only want to know which memory regions are not accessed for more than a specific time at the moment. Also, some DAMOS users would want to know exactly to what memory regions the schemes' actions tried to be applied, for a debugging or a tuning. As DAMOS has its internal mechanism for quota and regions prioritization, the users would need to simulate DAMOS' mechanism against the monitoring results. That's unnecessarily complex. This patchset implements DAMON kernel API callbacks and sysfs directory for efficient exposure of the information for the use cases. The new callback will be called for each region when a DAMOS action is gonna tried to be applied to it. The sysfs directory will be called 'tried_regions' and placed under each scheme sysfs directory. Users can write a special keyworkd, 'update_schemes_regions', to the 'state' file of a kdamond sysfs directory. Then, DAMON sysfs interface will fill the directory with the information of regions that corresponding scheme action was tried to be applied for next one aggregation interval. Patches Sequence ---------------- The first one (patch 1) implements the callback for the kernel space users. Following two patches (patches 2 and 3) implements sysfs directories for the information and its sub directories. Two patches (patches 4 and 5) for implementing the special keywords for filling the data to and cleaning up the directories follow. Patch 6 adds a selftest for the new sysfs directory. Finally, two patches (patches 7 and 8) document the new feature in the administrator guide and the ABI document. This patch (of 8): Getting DAMON monitoring results of only specific access pattern (e.g., getting address ranges of memory that not accessed at all for two minutes) can be useful for efficient monitoring of the system. The information can also be helpful for deep level investigation of DAMON-based operation schemes. For that, users need to record (in case of the user space users) or iterate (in case of the kernel space users) full monitoring results and filter it out for the specific access pattern. In case of the DAMOS investigation, users will even need to simulate DAMOS' quota and prioritization mechanisms. It's inefficient and complex. Add a new DAMON callback that will be called before each scheme is applied to each region. DAMON kernel API users will be able to do the query-like monitoring results collection, or DAMOS investigation in an efficient and simple way using it. Commits for providing the capability to the user space users will follow. Link: https://lkml.kernel.org/r/20221101220328.95765-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 620ada094c3b..35630634d790 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -357,6 +357,7 @@ struct damon_operations { * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. + * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * @private: User private data. * @@ -385,6 +386,10 @@ struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); + int (*before_damos_apply)(struct damon_ctx *context, + struct damon_target *target, + struct damon_region *region, + struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; -- cgit From ca92ea3dc5a2b01f98e9f02b7a6bc03be06fe124 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Sun, 30 Oct 2022 17:41:50 -0400 Subject: mm: always compile in pte markers Patch series "mm: Use pte marker for swapin errors". This series uses the pte marker to replace the swapin error swap entry, then we save one more swap entry slot for swap devices. A new pte marker bit is defined. This patch (of 2): The PTE markers code is tiny and now it's enabled for most of the distributions. It's fine to keep it as-is, but to make a broader use of it (e.g. replacing read error swap entry) it needs to be there always otherwise we need special code path to take care of !PTE_MARKER case. It'll be easier just make pte marker always exist. Use this chance to extend its usage to anonymous too by simply touching up some of the old comments, because it'll be used for anonymous pages in the follow up patches. Link: https://lkml.kernel.org/r/20221030214151.402274-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221030214151.402274-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Huang Ying Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Naoya Horiguchi Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++------- include/linux/swapops.h | 31 ------------------------------- 2 files changed, 3 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 369d7799205d..211aeca9bfa7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -60,17 +60,13 @@ static inline int current_is_kswapd(void) SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ SWP_PTE_MARKER_NUM) /* - * PTE markers are used to persist information onto PTEs that are mapped with - * file-backed memories. As its name "PTE" hints, it should only be applied to - * the leaves of pgtables. + * PTE markers are used to persist information onto PTEs that otherwise + * should be a none pte. As its name "PTE" hints, it should only be + * applied to the leaves of pgtables. */ -#ifdef CONFIG_PTE_MARKER #define SWP_PTE_MARKER_NUM 1 #define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ SWP_MIGRATION_NUM + SWP_DEVICE_NUM) -#else -#define SWP_PTE_MARKER_NUM 0 -#endif /* * Unaddressable device memory support. See include/linux/hmm.h and diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3ba9bf56899d..35c1fe62d2e1 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -412,8 +412,6 @@ typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) #define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP) -#ifdef CONFIG_PTE_MARKER - static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { return swp_entry(SWP_PTE_MARKER, marker); @@ -434,32 +432,6 @@ static inline bool is_pte_marker(pte_t pte) return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); } -#else /* CONFIG_PTE_MARKER */ - -static inline swp_entry_t make_pte_marker_entry(pte_marker marker) -{ - /* This should never be called if !CONFIG_PTE_MARKER */ - WARN_ON_ONCE(1); - return swp_entry(0, 0); -} - -static inline bool is_pte_marker_entry(swp_entry_t entry) -{ - return false; -} - -static inline pte_marker pte_marker_get(swp_entry_t entry) -{ - return 0; -} - -static inline bool is_pte_marker(pte_t pte) -{ - return false; -} - -#endif /* CONFIG_PTE_MARKER */ - static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); @@ -477,9 +449,6 @@ static inline pte_t make_pte_marker(pte_marker marker) * memory, kernel-only memory (including when the system is during-boot), * non-ram based generic file-system. It's fine to be used even there, but the * extra pte marker check will be pure overhead. - * - * For systems configured with !CONFIG_PTE_MARKER this will be automatically - * optimized to pte_none(). */ static inline int pte_none_mostly(pte_t pte) { -- cgit From 15520a3f046998e3f57e695743e99b0875e2dae7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Sun, 30 Oct 2022 17:41:51 -0400 Subject: mm: use pte markers for swap errors PTE markers are ideal mechanism for things like SWP_SWAPIN_ERROR. Using a whole swap entry type for this purpose can be an overkill, especially if we already have PTE markers. Define a new bit for swapin error and replace it with pte markers. Then we can safely drop SWP_SWAPIN_ERROR and give one device slot back to swap. We used to have SWP_SWAPIN_ERROR taking the page pfn as part of the swap entry, but it's never used. Neither do I see how it can be useful because normally the swapin failure should not be caused by a bad page but bad swap device. Drop it alongside. Link: https://lkml.kernel.org/r/20221030214151.402274-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Huang Ying Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 +----- include/linux/swapops.h | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 211aeca9bfa7..fec6647a289a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,10 +55,6 @@ static inline int current_is_kswapd(void) * actions on faults. */ -#define SWP_SWAPIN_ERROR_NUM 1 -#define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ - SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ - SWP_PTE_MARKER_NUM) /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be @@ -121,7 +117,7 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ - SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM) + SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 35c1fe62d2e1..27ade4f22abb 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -162,16 +162,6 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) return xa_mk_value(entry.val); } -static inline swp_entry_t make_swapin_error_entry(struct page *page) -{ - return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page)); -} - -static inline int is_swapin_error_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_SWAPIN_ERROR; -} - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -409,8 +399,9 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) typedef unsigned long pte_marker; -#define PTE_MARKER_UFFD_WP BIT(0) -#define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP) +#define PTE_MARKER_UFFD_WP BIT(0) +#define PTE_MARKER_SWAPIN_ERROR BIT(1) +#define PTE_MARKER_MASK (BIT(2) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { @@ -437,6 +428,17 @@ static inline pte_t make_pte_marker(pte_marker marker) return swp_entry_to_pte(make_pte_marker_entry(marker)); } +static inline swp_entry_t make_swapin_error_entry(void) +{ + return make_pte_marker_entry(PTE_MARKER_SWAPIN_ERROR); +} + +static inline int is_swapin_error_entry(swp_entry_t entry) +{ + return is_pte_marker_entry(entry) && + (pte_marker_get(entry) & PTE_MARKER_SWAPIN_ERROR); +} + /* * This is a special version to check pte_none() just to cover the case when * the pte is a pte marker. It existed because in many cases the pte marker -- cgit From dad6a5eb55564845aa17b8b20fa834af21e46c48 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Nov 2022 18:48:45 -0700 Subject: mm,hugetlb: use folio fields in second tail page Patch series "mm,huge,rmap: unify and speed up compound mapcounts". This patch (of 3): We want to declare one more int in the first tail of a compound page: that first tail page being valuable property, since every compound page has a first tail, but perhaps no more than that. No problem on 64-bit: there is already space for it. No problem with 32-bit THPs: 5.18 commit 5232c63f46fd ("mm: Make compound_pincount always available") kindly cleared the space for it, apparently not realizing that only 64-bit architectures enable CONFIG_THP_SWAP (whose use of tail page->private might conflict) - but make sure of that in its Kconfig. But hugetlb pages use tail page->private of the first tail page for a subpool pointer, which will conflict; and they also use page->private of the 2nd, 3rd and 4th tails. Undo "mm: add private field of first tail to struct page and struct folio"'s recent addition of private_1 to the folio tail: instead add hugetlb_subpool, hugetlb_cgroup, hugetlb_cgroup_rsvd, hugetlb_hwpoison to a second tail page of the folio: THP has long been using several fields of that tail, so make better use of it for hugetlb too. This is not how a generic folio should be declared in future, but it is an effective transitional way to make use of it. Delete the SUBPAGE_INDEX stuff, but keep __NR_USED_SUBPAGE: now 3. [hughd@google.com: prefix folio's page_1 and page_2 with double underscore, give folio's _flags_2 and _head_2 a line documentation each] Link: https://lkml.kernel.org/r/9e2cb6b-5b58-d3f2-b5ee-5f8a14e8f10@google.com Link: https://lkml.kernel.org/r/5f52de70-975-e94f-f141-543765736181@google.com Link: https://lkml.kernel.org/r/3818cc9a-9999-d064-d778-9c94c5911e6@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 23 +++---------- include/linux/hugetlb_cgroup.h | 31 +++++------------- include/linux/mm_types.h | 74 ++++++++++++++++++++++++++++-------------- 3 files changed, 64 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 58a30938a9b1..551834cd5299 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -33,22 +33,9 @@ typedef struct { unsigned long pd; } hugepd_t; /* * For HugeTLB page, there are more metadata to save in the struct page. But * the head struct page cannot meet our needs, so we have to abuse other tail - * struct page to store the metadata. In order to avoid conflicts caused by - * subsequent use of more tail struct pages, we gather these discrete indexes - * of tail struct page here. + * struct page to store the metadata. */ -enum { - SUBPAGE_INDEX_SUBPOOL = 1, /* reuse page->private */ -#ifdef CONFIG_CGROUP_HUGETLB - SUBPAGE_INDEX_CGROUP, /* reuse page->private */ - SUBPAGE_INDEX_CGROUP_RSVD, /* reuse page->private */ - __MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD, -#endif -#ifdef CONFIG_MEMORY_FAILURE - SUBPAGE_INDEX_HWPOISON, -#endif - __NR_USED_SUBPAGE, -}; +#define __NR_USED_SUBPAGE 3 struct hugepage_subpool { spinlock_t lock; @@ -722,11 +709,11 @@ extern unsigned int default_hstate_idx; static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { - return (void *)folio_get_private_1(folio); + return folio->_hugetlb_subpool; } /* - * hugetlb page subpool pointer located in hpage[1].private + * hugetlb page subpool pointer located in hpage[2].hugetlb_subpool */ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { @@ -736,7 +723,7 @@ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) static inline void hugetlb_set_folio_subpool(struct folio *folio, struct hugepage_subpool *subpool) { - folio_set_private_1(folio, (unsigned long)subpool); + folio->_hugetlb_subpool = subpool; } static inline void hugetlb_set_page_subpool(struct page *hpage, diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index c70f92fe493e..f706626a8063 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -24,12 +24,10 @@ struct file_region; #ifdef CONFIG_CGROUP_HUGETLB /* * Minimum page order trackable by hugetlb cgroup. - * At least 4 pages are necessary for all the tracking information. - * The second tail page (hpage[SUBPAGE_INDEX_CGROUP]) is the fault - * usage cgroup. The third tail page (hpage[SUBPAGE_INDEX_CGROUP_RSVD]) - * is the reservation usage cgroup. + * At least 3 pages are necessary for all the tracking information. + * The second tail page contains all of the hugetlb-specific fields. */ -#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__MAX_CGROUP_SUBPAGE_INDEX + 1) +#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE) enum hugetlb_memory_event { HUGETLB_MAX, @@ -69,21 +67,13 @@ struct hugetlb_cgroup { static inline struct hugetlb_cgroup * __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { - struct page *tail; - VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return NULL; - - if (rsvd) { - tail = folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD); - return (void *)page_private(tail); - } - - else { - tail = folio_page(folio, SUBPAGE_INDEX_CGROUP); - return (void *)page_private(tail); - } + if (rsvd) + return folio->_hugetlb_cgroup_rsvd; + else + return folio->_hugetlb_cgroup; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) @@ -101,15 +91,12 @@ static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); - if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return; if (rsvd) - set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD), - (unsigned long)h_cg); + folio->_hugetlb_cgroup_rsvd = h_cg; else - set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP), - (unsigned long)h_cg); + folio->_hugetlb_cgroup = h_cg; } static inline void set_hugetlb_cgroup(struct folio *folio, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e86861ff5bbd..44f1f8b6be02 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -145,15 +145,22 @@ struct page { atomic_t compound_pincount; #ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ - unsigned long _private_1; #endif }; - struct { /* Second tail page of compound page */ + struct { /* Second tail page of transparent huge page */ unsigned long _compound_pad_1; /* compound_head */ unsigned long _compound_pad_2; /* For both global and memcg */ struct list_head deferred_list; }; + struct { /* Second tail page of hugetlb page */ + unsigned long _hugetlb_pad_1; /* compound_head */ + void *hugetlb_subpool; + void *hugetlb_cgroup; + void *hugetlb_cgroup_rsvd; + void *hugetlb_hwpoison; + /* No more space on 32-bit: use third tail if more */ + }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ @@ -260,13 +267,18 @@ struct page { * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. * @_flags_1: For large folios, additional page flags. - * @__head: Points to the folio. Do not use. + * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). - * @_private_1: Do not use directly, call folio_get_private_1(). + * @_flags_2: For alignment. Do not use. + * @_head_2: Points to the folio. Do not use. + * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. + * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. + * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. + * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -305,16 +317,31 @@ struct folio { }; struct page page; }; - unsigned long _flags_1; - unsigned long __head; - unsigned char _folio_dtor; - unsigned char _folio_order; - atomic_t _total_mapcount; - atomic_t _pincount; + union { + struct { + unsigned long _flags_1; + unsigned long _head_1; + unsigned char _folio_dtor; + unsigned char _folio_order; + atomic_t _total_mapcount; + atomic_t _pincount; #ifdef CONFIG_64BIT - unsigned int _folio_nr_pages; + unsigned int _folio_nr_pages; #endif - unsigned long _private_1; + }; + struct page __page_1; + }; + union { + struct { + unsigned long _flags_2; + unsigned long _head_2; + void *_hugetlb_subpool; + void *_hugetlb_cgroup; + void *_hugetlb_cgroup_rsvd; + void *_hugetlb_hwpoison; + }; + struct page __page_2; + }; }; #define FOLIO_MATCH(pg, fl) \ @@ -335,16 +362,25 @@ FOLIO_MATCH(memcg_data, memcg_data); static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); -FOLIO_MATCH(compound_head, __head); +FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); FOLIO_MATCH(compound_mapcount, _total_mapcount); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); -FOLIO_MATCH(_private_1, _private_1); #endif #undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + 2 * sizeof(struct page)) +FOLIO_MATCH(flags, _flags_2); +FOLIO_MATCH(compound_head, _head_2); +FOLIO_MATCH(hugetlb_subpool, _hugetlb_subpool); +FOLIO_MATCH(hugetlb_cgroup, _hugetlb_cgroup); +FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd); +FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison); +#undef FOLIO_MATCH static inline atomic_t *folio_mapcount_ptr(struct folio *folio) { @@ -388,16 +424,6 @@ static inline void *folio_get_private(struct folio *folio) return folio->private; } -static inline void folio_set_private_1(struct folio *folio, unsigned long private) -{ - folio->_private_1 = private; -} - -static inline unsigned long folio_get_private_1(struct folio *folio) -{ - return folio->_private_1; -} - struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) -- cgit From cb67f4282bf9693658dbda934a441ddbbb1446df Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Nov 2022 18:51:38 -0700 Subject: mm,thp,rmap: simplify compound page mapcount handling Compound page (folio) mapcount calculations have been different for anon and file (or shmem) THPs, and involved the obscure PageDoubleMap flag. And each huge mapping and unmapping of a file (or shmem) THP involved atomically incrementing and decrementing the mapcount of every subpage of that huge page, dirtying many struct page cachelines. Add subpages_mapcount field to the struct folio and first tail page, so that the total of subpage mapcounts is available in one place near the head: then page_mapcount() and total_mapcount() and page_mapped(), and their folio equivalents, are so quick that anon and file and hugetlb don't need to be optimized differently. Delete the unloved PageDoubleMap. page_add and page_remove rmap functions must now maintain the subpages_mapcount as well as the subpage _mapcount, when dealing with pte mappings of huge pages; and correct maintenance of NR_ANON_MAPPED and NR_FILE_MAPPED statistics still needs reading through the subpages, using nr_subpages_unmapped() - but only when first or last pmd mapping finds subpages_mapcount raised (double-map case, not the common case). But are those counts (used to decide when to split an anon THP, and in vmscan's pagecache_reclaimable heuristic) correctly maintained? Not quite: since page_remove_rmap() (and also split_huge_pmd()) is often called without page lock, there can be races when a subpage pte mapcount 0<->1 while compound pmd mapcount 0<->1 is scanning - races which the previous implementation had prevented. The statistics might become inaccurate, and even drift down until they underflow through 0. That is not good enough, but is better dealt with in a followup patch. Update a few comments on first and second tail page overlaid fields. hugepage_add_new_anon_rmap() has to "increment" compound_mapcount, but subpages_mapcount and compound_pincount are already correctly at 0, so delete its reinitialization of compound_pincount. A simple 100 X munmap(mmap(2GB, MAP_SHARED|MAP_POPULATE, tmpfs), 2GB) took 18 seconds on small pages, and used to take 1 second on huge pages, but now takes 119 milliseconds on huge pages. Mapping by pmds a second time used to take 860ms and now takes 92ms; mapping by pmds after mapping by ptes (when the scan is needed) used to take 870ms and now takes 495ms. But there might be some benchmarks which would show a slowdown, because tail struct pages now fall out of cache until final freeing checks them. Link: https://lkml.kernel.org/r/47ad693-717-79c8-e1ba-46c3a6602e48@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/mm.h | 85 +++++++++++++++++++++++++++++++++------------- include/linux/mm_types.h | 21 ++++++++++-- include/linux/page-flags.h | 21 ------------ include/linux/rmap.h | 2 ++ 4 files changed, 82 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3950ef45b9a9..a904c2d60f12 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -818,8 +818,8 @@ static inline int is_vmalloc_or_module_addr(const void *x) /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for - * debugging purposes; look at folio_mapcount() or page_mapcount() - * instead. + * debugging purposes - it does not include PTE-mapped sub-pages; look + * at folio_mapcount() or page_mapcount() or total_mapcount() instead. */ static inline int folio_entire_mapcount(struct folio *folio) { @@ -829,12 +829,20 @@ static inline int folio_entire_mapcount(struct folio *folio) /* * Mapcount of compound page as a whole, does not include mapped sub-pages. - * - * Must be called only for compound pages. + * Must be called only on head of compound page. */ -static inline int compound_mapcount(struct page *page) +static inline int head_compound_mapcount(struct page *head) { - return folio_entire_mapcount(page_folio(page)); + return atomic_read(compound_mapcount_ptr(head)) + 1; +} + +/* + * Sum of mapcounts of sub-pages, does not include compound mapcount. + * Must be called only on head of compound page. + */ +static inline int head_subpages_mapcount(struct page *head) +{ + return atomic_read(subpages_mapcount_ptr(head)); } /* @@ -847,11 +855,9 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -int __page_mapcount(struct page *page); - /* * Mapcount of 0-order page; when compound sub-page, includes - * compound_mapcount(). + * compound_mapcount of compound_head of page. * * Result is undefined for pages which cannot be mapped into userspace. * For example SLAB or special types of pages. See function page_has_type(). @@ -859,25 +865,61 @@ int __page_mapcount(struct page *page); */ static inline int page_mapcount(struct page *page) { - if (unlikely(PageCompound(page))) - return __page_mapcount(page); - return atomic_read(&page->_mapcount) + 1; -} + int mapcount = atomic_read(&page->_mapcount) + 1; -int folio_mapcount(struct folio *folio); + if (likely(!PageCompound(page))) + return mapcount; + page = compound_head(page); + return head_compound_mapcount(page) + mapcount; +} -#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int total_mapcount(struct page *page) { - return folio_mapcount(page_folio(page)); + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + page = compound_head(page); + return head_compound_mapcount(page) + head_subpages_mapcount(page); } -#else -static inline int total_mapcount(struct page *page) +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped, + * even if this particular subpage is not itself mapped by any PTE or PMD. + */ +static inline bool page_mapped(struct page *page) { - return page_mapcount(page); + return total_mapcount(page) > 0; +} + +/** + * folio_mapcount() - Calculate the number of mappings of this folio. + * @folio: The folio. + * + * A large folio tracks both how many times the entire folio is mapped, + * and how many times each individual page in the folio is mapped. + * This function calculates the total number of times the folio is + * mapped. + * + * Return: The number of times this folio is mapped. + */ +static inline int folio_mapcount(struct folio *folio) +{ + if (likely(!folio_test_large(folio))) + return atomic_read(&folio->_mapcount) + 1; + return atomic_read(folio_mapcount_ptr(folio)) + 1 + + atomic_read(folio_subpages_mapcount_ptr(folio)); +} + +/** + * folio_mapped - Is this folio mapped into userspace? + * @folio: The folio. + * + * Return: True if any page in this folio is referenced by user page tables. + */ +static inline bool folio_mapped(struct folio *folio) +{ + return folio_mapcount(folio) > 0; } -#endif static inline struct page *virt_to_head_page(const void *x) { @@ -1800,9 +1842,6 @@ static inline pgoff_t page_index(struct page *page) return page->index; } -bool page_mapped(struct page *page); -bool folio_mapped(struct folio *folio); - /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 44f1f8b6be02..44a1a699b5ad 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -142,6 +142,7 @@ struct page { unsigned char compound_dtor; unsigned char compound_order; atomic_t compound_mapcount; + atomic_t subpages_mapcount; atomic_t compound_pincount; #ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ @@ -270,7 +271,8 @@ struct page { * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). - * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_subpages_mapcount: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * @_flags_2: For alignment. Do not use. @@ -323,7 +325,8 @@ struct folio { unsigned long _head_1; unsigned char _folio_dtor; unsigned char _folio_order; - atomic_t _total_mapcount; + atomic_t _compound_mapcount; + atomic_t _subpages_mapcount; atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; @@ -365,7 +368,8 @@ FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); -FOLIO_MATCH(compound_mapcount, _total_mapcount); +FOLIO_MATCH(compound_mapcount, _compound_mapcount); +FOLIO_MATCH(subpages_mapcount, _subpages_mapcount); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); @@ -388,11 +392,22 @@ static inline atomic_t *folio_mapcount_ptr(struct folio *folio) return &tail->compound_mapcount; } +static inline atomic_t *folio_subpages_mapcount_ptr(struct folio *folio) +{ + struct page *tail = &folio->page + 1; + return &tail->subpages_mapcount; +} + static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; } +static inline atomic_t *subpages_mapcount_ptr(struct page *page) +{ + return &page[1].subpages_mapcount; +} + static inline atomic_t *compound_pincount_ptr(struct page *page) { return &page[1].compound_pincount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0b0ae5084e60..e42c55a7e012 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -176,9 +176,6 @@ enum pageflags { /* SLOB */ PG_slob_free = PG_private, - /* Compound pages. Stored in first tail page's flags */ - PG_double_map = PG_workingset, - #ifdef CONFIG_MEMORY_FAILURE /* * Compound pages. Stored in first tail page's flags. @@ -874,29 +871,11 @@ static inline int PageTransTail(struct page *page) { return PageTail(page); } - -/* - * PageDoubleMap indicates that the compound page is mapped with PTEs as well - * as PMDs. - * - * This is required for optimization of rmap operations for THP: we can postpone - * per small page mapcount accounting (and its overhead from atomic operations) - * until the first PMD split. - * - * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up - * by one. This reference will go away with last compound_mapcount. - * - * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). - */ -PAGEFLAG(DoubleMap, double_map, PF_SECOND) - TESTSCFLAG(DoubleMap, double_map, PF_SECOND) #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) TESTPAGEFLAG_FALSE(TransTail, transtail) -PAGEFLAG_FALSE(DoubleMap, double_map) - TESTSCFLAG_FALSE(DoubleMap, double_map) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bd3504d11b15..1973649e8f93 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -206,6 +206,8 @@ void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, static inline void __page_dup_rmap(struct page *page, bool compound) { + if (!compound && PageCompound(page)) + atomic_inc(subpages_mapcount_ptr(compound_head(page))); atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); } -- cgit From 9bd3155ed83b723be719e522760f107229e2a61b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Nov 2022 18:53:45 -0700 Subject: mm,thp,rmap: lock_compound_mapcounts() on THP mapcounts Fix the races in maintaining compound_mapcount, subpages_mapcount and subpage _mapcount by using PG_locked in the first tail of any compound page for a bit_spin_lock() on such modifications; skipping the usual atomic operations on those fields in this case. Bring page_remove_file_rmap() and page_remove_anon_compound_rmap() back into page_remove_rmap() itself. Rearrange page_add_anon_rmap() and page_add_file_rmap() and page_remove_rmap() to follow the same "if (compound) {lock} else if (PageCompound) {lock} else {atomic}" pattern (with a PageTransHuge in the compound test, like before, to avoid BUG_ONs and optimize away that block when THP is not configured). Move all the stats updates outside, after the bit_spin_locked section, so that it is sure to be a leaf lock. Add page_dup_compound_rmap() to manage compound locking versus atomics in sync with the rest. In particular, hugetlb pages are still using the atomics: to avoid unnecessary interference there, and because they never have subpage mappings; but this exception can easily be changed. Conveniently, page_dup_compound_rmap() turns out to suit an anon THP's __split_huge_pmd_locked() too. bit_spin_lock() is not popular with PREEMPT_RT folks: but PREEMPT_RT sensibly excludes TRANSPARENT_HUGEPAGE already, so its only exposure is to the non-hugetlb non-THP pte-mapped compound pages (with large folios being currently dependent on TRANSPARENT_HUGEPAGE). There is never any scan of subpages in this case; but we have chosen to use PageCompound tests rather than PageTransCompound tests to gate the use of lock_compound_mapcounts(), so that page_mapped() is correct on all compound pages, whether or not TRANSPARENT_HUGEPAGE is enabled: could that be a problem for PREEMPT_RT, when there is contention on the lock - under heavy concurrent forking for example? If so, then it can be turned into a sleeping lock (like folio_lock()) when PREEMPT_RT. A simple 100 X munmap(mmap(2GB, MAP_SHARED|MAP_POPULATE, tmpfs), 2GB) took 18 seconds on small pages, and used to take 1 second on huge pages, but now takes 115 milliseconds on huge pages. Mapping by pmds a second time used to take 860ms and now takes 86ms; mapping by pmds after mapping by ptes (when the scan is needed) used to take 870ms and now takes 495ms. Mapping huge pages by ptes is largely unaffected but variable: between 5% faster and 5% slower in what I've recorded. Contention on the lock is likely to behave worse than contention on the atomics behaved. Link: https://lkml.kernel.org/r/1b42bd1a-8223-e827-602f-d466c2db7d3c@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/rmap.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1973649e8f93..011a7530dc76 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -204,16 +204,14 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -static inline void __page_dup_rmap(struct page *page, bool compound) -{ - if (!compound && PageCompound(page)) - atomic_inc(subpages_mapcount_ptr(compound_head(page))); - atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); -} +void page_dup_compound_rmap(struct page *page, bool compound); static inline void page_dup_file_rmap(struct page *page, bool compound) { - __page_dup_rmap(page, compound); + if (PageCompound(page)) + page_dup_compound_rmap(page, compound); + else + atomic_inc(&page->_mapcount); } /** @@ -262,7 +260,7 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * the page R/O into both processes. */ dup: - __page_dup_rmap(page, compound); + page_dup_file_rmap(page, compound); return 0; } -- cgit From be5ef2d9b006bbd93b1a03e1da2dbd19fb0b9f14 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Nov 2022 01:42:04 -0800 Subject: mm,thp,rmap: subpages_mapcount of PTE-mapped subpages Patch series "mm,thp,rmap: rework the use of subpages_mapcount", v2. This patch (of 3): Following suggestion from Linus, instead of counting every PTE map of a compound page in subpages_mapcount, just count how many of its subpages are PTE-mapped: this yields the exact number needed for NR_ANON_MAPPED and NR_FILE_MAPPED stats, without any need for a locked scan of subpages; and requires updating the count less often. This does then revert total_mapcount() and folio_mapcount() to needing a scan of subpages; but they are inherently racy, and need no locking, so Linus is right that the scans are much better done there. Plus (unlike in 6.1 and previous) subpages_mapcount lets us avoid the scan in the common case of no PTE maps. And page_mapped() and folio_mapped() remain scanless and just as efficient with the new meaning of subpages_mapcount: those are the functions which I most wanted to remove the scan from. The updated page_dup_compound_rmap() is no longer suitable for use by anon THP's __split_huge_pmd_locked(); but page_add_anon_rmap() can be used for that, so long as its VM_BUG_ON_PAGE(!PageLocked) is deleted. Evidence is that this way goes slightly faster than the previous implementation for most cases; but significantly faster in the (now scanless) pmds after ptes case, which started out at 870ms and was brought down to 495ms by the previous series, now takes around 105ms. Link: https://lkml.kernel.org/r/a5849eca-22f1-3517-bf29-95d982242742@google.com Link: https://lkml.kernel.org/r/eec17e16-4e1-7c59-f1bc-5bca90dac919@google.com Signed-off-by: Hugh Dickins Suggested-by: Linus Torvalds Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: David Hildenbrand Cc: James Houghton Cc: Johannes Weiner Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/mm.h | 52 +++++++++++++++++++++++++++++++--------------------- include/linux/rmap.h | 9 +++++---- 2 files changed, 36 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a904c2d60f12..84fb91f6f56e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -837,7 +837,7 @@ static inline int head_compound_mapcount(struct page *head) } /* - * Sum of mapcounts of sub-pages, does not include compound mapcount. + * Number of sub-pages mapped by PTE, does not include compound mapcount. * Must be called only on head of compound page. */ static inline int head_subpages_mapcount(struct page *head) @@ -873,23 +873,7 @@ static inline int page_mapcount(struct page *page) return head_compound_mapcount(page) + mapcount; } -static inline int total_mapcount(struct page *page) -{ - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) + 1; - page = compound_head(page); - return head_compound_mapcount(page) + head_subpages_mapcount(page); -} - -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any subpage of compound page is mapped, - * even if this particular subpage is not itself mapped by any PTE or PMD. - */ -static inline bool page_mapped(struct page *page) -{ - return total_mapcount(page) > 0; -} +int total_compound_mapcount(struct page *head); /** * folio_mapcount() - Calculate the number of mappings of this folio. @@ -906,8 +890,20 @@ static inline int folio_mapcount(struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) + 1; - return atomic_read(folio_mapcount_ptr(folio)) + 1 + - atomic_read(folio_subpages_mapcount_ptr(folio)); + return total_compound_mapcount(&folio->page); +} + +static inline int total_mapcount(struct page *page) +{ + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + return total_compound_mapcount(compound_head(page)); +} + +static inline bool folio_large_is_mapped(struct folio *folio) +{ + return atomic_read(folio_mapcount_ptr(folio)) + + atomic_read(folio_subpages_mapcount_ptr(folio)) >= 0; } /** @@ -918,7 +914,21 @@ static inline int folio_mapcount(struct folio *folio) */ static inline bool folio_mapped(struct folio *folio) { - return folio_mapcount(folio) > 0; + if (likely(!folio_test_large(folio))) + return atomic_read(&folio->_mapcount) >= 0; + return folio_large_is_mapped(folio); +} + +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any sub-page of compound page is mapped, + * even if this particular sub-page is not itself mapped by any PTE or PMD. + */ +static inline bool page_mapped(struct page *page) +{ + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + return folio_large_is_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 011a7530dc76..5dadb9a3e010 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -204,14 +204,15 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -void page_dup_compound_rmap(struct page *page, bool compound); +void page_dup_compound_rmap(struct page *page); static inline void page_dup_file_rmap(struct page *page, bool compound) { - if (PageCompound(page)) - page_dup_compound_rmap(page, compound); - else + /* Is page being mapped by PTE? */ + if (likely(!compound)) atomic_inc(&page->_mapcount); + else + page_dup_compound_rmap(page); } /** -- cgit From 4b51634cd16a01b2be0f6b69cc0dae63de4751f2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Nov 2022 01:49:36 -0800 Subject: mm,thp,rmap: subpages_mapcount COMPOUND_MAPPED if PMD-mapped Can the lock_compound_mapcount() bit_spin_lock apparatus be removed now? Yes. Not by atomic64_t or cmpxchg games, those get difficult on 32-bit; but if we slightly abuse subpages_mapcount by additionally demanding that one bit be set there when the compound page is PMD-mapped, then a cascade of two atomic ops is able to maintain the stats without bit_spin_lock. This is harder to reason about than when bit_spin_locked, but I believe safe; and no drift in stats detected when testing. When there are racing removes and adds, of course the sequence of operations is less well- defined; but each operation on subpages_mapcount is atomically good. What might be disastrous, is if subpages_mapcount could ever fleetingly appear negative: but the pte lock (or pmd lock) these rmap functions are called under, ensures that a last remove cannot race ahead of a first add. Continue to make an exception for hugetlb (PageHuge) pages, though that exception can be easily removed by a further commit if necessary: leave subpages_mapcount 0, don't bother with COMPOUND_MAPPED in its case, just carry on checking compound_mapcount too in folio_mapped(), page_mapped(). Evidence is that this way goes slightly faster than the previous implementation in all cases (pmds after ptes now taking around 103ms); and relieves us of worrying about contention on the bit_spin_lock. Link: https://lkml.kernel.org/r/3978f3ca-5473-55a7-4e14-efea5968d892@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: David Hildenbrand Cc: James Houghton Cc: Johannes Weiner Cc: John Hubbard Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/mm.h | 19 ++++++++++++++++--- include/linux/rmap.h | 13 ++++++------- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 84fb91f6f56e..d33639be3db3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -836,13 +836,22 @@ static inline int head_compound_mapcount(struct page *head) return atomic_read(compound_mapcount_ptr(head)) + 1; } +/* + * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages, + * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently + * leaves subpages_mapcount at 0, but avoid surprise if it participates later. + */ +#define COMPOUND_MAPPED 0x800000 +#define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1) + /* * Number of sub-pages mapped by PTE, does not include compound mapcount. * Must be called only on head of compound page. */ static inline int head_subpages_mapcount(struct page *head) { - return atomic_read(subpages_mapcount_ptr(head)); + return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED; } /* @@ -902,8 +911,12 @@ static inline int total_mapcount(struct page *page) static inline bool folio_large_is_mapped(struct folio *folio) { - return atomic_read(folio_mapcount_ptr(folio)) + - atomic_read(folio_subpages_mapcount_ptr(folio)) >= 0; + /* + * Reading folio_mapcount_ptr() below could be omitted if hugetlb + * participated in incrementing subpages_mapcount when compound mapped. + */ + return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 || + atomic_read(folio_mapcount_ptr(folio)) >= 0; } /** diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5dadb9a3e010..bd3504d11b15 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -204,15 +204,14 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -void page_dup_compound_rmap(struct page *page); +static inline void __page_dup_rmap(struct page *page, bool compound) +{ + atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); +} static inline void page_dup_file_rmap(struct page *page, bool compound) { - /* Is page being mapped by PTE? */ - if (likely(!compound)) - atomic_inc(&page->_mapcount); - else - page_dup_compound_rmap(page); + __page_dup_rmap(page, compound); } /** @@ -261,7 +260,7 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * the page R/O into both processes. */ dup: - page_dup_file_rmap(page, compound); + __page_dup_rmap(page, compound); return 0; } -- cgit From eb309ec89953d6a3e8e35a3a577bab13893858d8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:49 +0100 Subject: mm/mprotect: factor out check whether manual PTE write upgrades are required Let's factor the check out into vma_wants_manual_pte_write_upgrade(), to be reused in NUMA hinting fault context soon. Link: https://lkml.kernel.org/r/20221108174652.198904-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d33639be3db3..e203e8a83e2d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2088,6 +2088,20 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) +int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); +static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) +{ + /* + * We want to check manually if we can change individual PTEs writable + * if we can't do that automatically for all PTEs in a mapping. For + * private mappings, that's always the case when we have write + * permissions as we properly have to handle COW. + */ + if (vma->vm_flags & VM_SHARED) + return vma_wants_writenotify(vma, vma->vm_page_prot); + return !!(vma->vm_flags & VM_WRITE); + +} extern unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, @@ -2227,8 +2241,6 @@ static inline int pte_devmap(pte_t pte) } #endif -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); - extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, -- cgit From 6a56ccbcf6c69538b152644107a1d7383c876ca7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:50 +0100 Subject: mm/autonuma: use can_change_(pte|pmd)_writable() to replace savedwrite commit b191f9b106ea ("mm: numa: preserve PTE write permissions across a NUMA hinting fault") added remembering write permissions using ordinary pte_write() for PROT_NONE mapped pages to avoid write faults when remapping the page !PROT_NONE on NUMA hinting faults. That commit noted: The patch looks hacky but the alternatives looked worse. The tidest was to rewalk the page tables after a hinting fault but it was more complex than this approach and the performance was worse. It's not generally safe to just mark the page writable during the fault if it's a write fault as it may have been read-only for COW so that approach was discarded. Later, commit 288bc54949fc ("mm/autonuma: let architecture override how the write bit should be stashed in a protnone pte.") introduced a family of savedwrite PTE functions that didn't necessarily improve the whole situation. One confusing thing is that nowadays, if a page is pte_protnone() and pte_savedwrite() then also pte_write() is true. Another source of confusion is that there is only a single pte_mk_savedwrite() call in the kernel. All other write-protection code seems to silently rely on pte_wrprotect(). Ever since PageAnonExclusive was introduced and we started using it in mprotect context via commit 64fe24a3e05e ("mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection"), we do have machinery in place to avoid write faults when changing protection, which is exactly what we want to do here. Let's similarly do what ordinary mprotect() does nowadays when upgrading write permissions and reuse can_change_pte_writable() and can_change_pmd_writable() to detect if we can upgrade PTE permissions to be writable. For anonymous pages there should be absolutely no change: if an anonymous page is not exclusive, it could not have been mapped writable -- because only exclusive anonymous pages can be mapped writable. However, there *might* be a change for writable shared mappings that require writenotify: if they are not dirty, we cannot map them writable. While it might not matter in practice, we'd need a different way to identify whether writenotify is actually required -- and ordinary mprotect would benefit from that as well. Note that we don't optimize for the actual migration case: (1) When migration succeeds the new PTE will not be writable because the source PTE was not writable (protnone); in the future we might just optimize that case similarly by reusing can_change_pte_writable()/can_change_pmd_writable() when removing migration PTEs. (2) When migration fails, we'd have to recalculate the "writable" flag because we temporarily dropped the PT lock; for now keep it simple and set "writable=false". We'll remove all savedwrite leftovers next. Link: https://lkml.kernel.org/r/20221108174652.198904-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e203e8a83e2d..8597ef676fc3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2102,6 +2102,8 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma return !!(vma->vm_flags & VM_WRITE); } +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); extern unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, -- cgit From d6379159f47630813f06f97535cc82ce7b9eed49 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:51 +0100 Subject: mm: remove unused savedwrite infrastructure NUMA hinting no longer uses savedwrite, let's rip it out. ... and while at it, drop __pte_write() and __pmd_write() on ppc64. Link: https://lkml.kernel.org/r/20221108174652.198904-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5f0d7d0b9471..c74cce67eec8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -503,30 +503,6 @@ static inline pte_t pte_sw_mkyoung(pte_t pte) #define pte_sw_mkyoung pte_sw_mkyoung #endif -#ifndef pte_savedwrite -#define pte_savedwrite pte_write -#endif - -#ifndef pte_mk_savedwrite -#define pte_mk_savedwrite pte_mkwrite -#endif - -#ifndef pte_clear_savedwrite -#define pte_clear_savedwrite pte_wrprotect -#endif - -#ifndef pmd_savedwrite -#define pmd_savedwrite pmd_write -#endif - -#ifndef pmd_mk_savedwrite -#define pmd_mk_savedwrite pmd_mkwrite -#endif - -#ifndef pmd_clear_savedwrite -#define pmd_clear_savedwrite pmd_wrprotect -#endif - #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, -- cgit From 70fb4fdff5826a48886152fd5c5db04eb6c59a40 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:48 -0800 Subject: mm: introduce 'encoded' page pointers with embedded extra bits We already have this notion in parts of the MM code (see the mlock code with the LRU_PAGE and NEW_PAGE bits), but I'm going to introduce a new case, and I refuse to do the same thing we've done before where we just put bits in the raw pointer and say it's still a normal pointer. So this introduces a 'struct encoded_page' pointer that cannot be used for anything else than to encode a real page pointer and a couple of extra bits in the low bits. That way the compiler can trivially track the state of the pointer and you just explicitly encode and decode the extra bits. Note that this makes the alignment of 'struct page' explicit even for the case where CONFIG_HAVE_ALIGNED_STRUCT_PAGE is not set. That is entirely redundant in almost all cases, since the page structure already contains several word-sized entries. However, on m68k, the alignment of even 32-bit data is just 16 bits, and as such in theory the alignment of 'struct page' could be too. So let's just make it very very explicit that the alignment needs to be at least 32 bits, giving us a guarantee of two unused low bits in the pointer. Now, in practice, our page struct array is aligned much more than that anyway, even on m68k, and our existing code in mm/mlock.c obviously already depended on that. But since the whole point of this change is to be careful about the type system when hiding extra bits in the pointer, let's also be explicit about the assumptions we make. NOTE! This is being very careful in another way too: it has a build-time assertion that the 'flags' added to the page pointer actually fit in the two bits. That means that this helper must be inlined, and can only be used in contexts where the compiler can statically determine that the value fits in the available bits. [akpm@linux-foundation.org: kerneldoc on a forward-declared struct confuses htmldocs] Link: https://lore.kernel.org/all/Y2tKixpO4RO6DgW5@tuxmaker.boeblingen.de.ibm.com/ Link: https://lkml.kernel.org/r/20221109203051.1835763-1-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Reviewed-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Christian Borntraeger Cc: Gerald Schaefer Cc: Heiko Carstens [s390] Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 44a1a699b5ad..6b0009e7d4ae 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -68,7 +68,7 @@ struct mem_cgroup; #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) #else -#define _struct_page_alignment +#define _struct_page_alignment __aligned(sizeof(unsigned long)) #endif struct page { @@ -251,6 +251,38 @@ struct page { #endif } _struct_page_alignment; +/* + * struct encoded_page - a nonexistent type marking this pointer + * + * An 'encoded_page' pointer is a pointer to a regular 'struct page', but + * with the low bits of the pointer indicating extra context-dependent + * information. Not super-common, but happens in mmu_gather and mlock + * handling, and this acts as a type system check on that use. + * + * We only really have two guaranteed bits in general, although you could + * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + * for more. + * + * Use the supplied helper functions to endcode/decode the pointer and bits. + */ +struct encoded_page; +#define ENCODE_PAGE_BITS 3ul +static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) +{ + BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); + return (struct encoded_page *)(flags | (unsigned long)page); +} + +static inline unsigned long encoded_page_flags(struct encoded_page *page) +{ + return ENCODE_PAGE_BITS & (unsigned long)page; +} + +static inline struct page *encoded_page_ptr(struct encoded_page *page) +{ + return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); +} + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. -- cgit From 449c796768c9a1c738d1fa8671fb01663380b8a7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:49 -0800 Subject: mm: teach release_pages() to take an array of encoded page pointers too release_pages() already could take either an array of page pointers, or an array of folio pointers. Expand it to also accept an array of encoded page pointers, which is what both the existing mlock() use and the upcoming mmu_gather use of encoded page pointers wants. Note that release_pages() won't actually use, or react to, any extra encoded bits. Instead, this is very much a case of "I have walked the array of encoded pages and done everything the extra bits tell me to do, now release it all". Also, while the "either page or folio pointers" dual use was handled with a cast of the pointer in "release_folios()", this takes a slightly different approach and uses the "transparent union" attribute to describe the set of arguments to the function: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html which has been supported by gcc forever, but the kernel hasn't used before. That allows us to avoid using various wrappers with casts, and just use the same function regardless of use. Link: https://lkml.kernel.org/r/20221109203051.1835763-2-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/mm.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8597ef676fc3..f873441303b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1245,7 +1245,24 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } -void release_pages(struct page **pages, int nr); +/** + * release_pages - release an array of pages or folios + * + * This just releases a simple array of multiple pages, and + * accepts various different forms of said page array: either + * a regular old boring array of pages, an array of folios, or + * an array of encoded page pointers. + * + * The transparent union syntax for this kind of "any of these + * argument types" is all kinds of ugly, so look away. + */ +typedef union { + struct page **pages; + struct folio **folios; + struct encoded_page **encoded_pages; +} release_pages_arg __attribute__ ((__transparent_union__)); + +void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. @@ -1261,7 +1278,7 @@ void release_pages(struct page **pages, int nr); */ static inline void folios_put(struct folio **folios, unsigned int nr) { - release_pages((struct page **)folios, nr); + release_pages(folios, nr); } static inline void put_page(struct page *page) -- cgit From 7cc8f9c7146a5c2dad6e71653c4f69972e73df6b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:50 -0800 Subject: mm: mmu_gather: prepare to gather encoded page pointers with flags This is purely a preparatory patch that makes all the data structures ready for encoding flags with the mmu_gather page pointers. The code currently always sets the flag to zero and doesn't use it yet, but now it's tracking the type state along. The next step will be to actually start using it. Link: https://lkml.kernel.org/r/20221109203051.1835763-3-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index fec6647a289a..b61e2007d156 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -463,7 +463,7 @@ static inline unsigned long total_swapcache_pages(void) extern void free_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct page **, int); +extern void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; -- cgit From 7c2af309abd24ff4e313246bf9b68f398d95c871 Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Wed, 9 Nov 2022 20:50:42 +0900 Subject: zram: add size class equals check into recompression It makes no sense for us to recompress the object if it will be in the same size class. We anyway don't get any memory gain. But, at the same time, we get a CPU time overhead when inserting this object into zspage and decompressing it afterwards. [senozhatsky: rebased and fixed conflicts] Link: https://lkml.kernel.org/r/20221109115047.2921851-9-senozhatsky@chromium.org Signed-off-by: Alexey Romanov Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- include/linux/zsmalloc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 2a430e713ce5..a48cd0ffe57d 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -55,5 +55,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle); unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); +unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); + void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); #endif -- cgit From b7217a0bbe00a98a8f4b15ebc2a8355a31a59e1e Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Mon, 14 Nov 2022 23:59:49 +0000 Subject: mm: shrinkers: add missing includes for undeclared types The shrinker.h header depends on a user including other headers before it for types used by shrinker.h. Fix this by including the appropriate headers in shrinker.h. ./include/linux/shrinker.h:13:9: error: unknown type name `gfp_t' 13 | gfp_t gfp_mask; | ^~~~~ ./include/linux/shrinker.h:71:26: error: field `list' has incomplete type 71 | struct list_head list; | ^~~~ ./include/linux/shrinker.h:82:9: error: unknown type name `atomic_long_t' 82 | atomic_long_t *nr_deferred; | Link: https://lkml.kernel.org/r/20221114235949.201749-1-tjmercier@google.com Fixes: 83aeeada7c69 ("vmscan: use atomic-long for shrinker batching") Fixes: b0d40c92adaf ("superblock: introduce per-sb cache shrinker infrastructure") Signed-off-by: T.J. Mercier Cc: Al Viro Cc: Dave Chinner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 08e6054e061f..71310efe2fab 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -2,6 +2,9 @@ #ifndef _LINUX_SHRINKER_H #define _LINUX_SHRINKER_H +#include +#include + /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. -- cgit From d09e8ca6cb93bb4b97517a18fbbf7eccb0e9ff43 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 15 Nov 2022 02:06:01 +0000 Subject: mm: anonymous shared memory naming Since commit 9a10064f5625 ("mm: add a field to store names for private anonymous memory"), name for private anonymous memory, but not shared anonymous, can be set. However, naming shared anonymous memory just as useful for tracking purposes. Extend the functionality to be able to set names for shared anon. There are two ways to create anonymous shared memory, using memfd or directly via mmap(): 1. fd = memfd_create(...) mem = mmap(..., MAP_SHARED, fd, ...) 2. mem = mmap(..., MAP_SHARED | MAP_ANONYMOUS, -1, ...) In both cases the anonymous shared memory is created the same way by mapping an unlinked file on tmpfs. The memfd way allows to give a name for anonymous shared memory, but not useful when parts of shared memory require to have distinct names. Example use case: The VMM maps VM memory as anonymous shared memory (not private because VMM is sandboxed and drivers are running in their own processes). However, the VM tells back to the VMM how parts of the memory are actually used by the guest, how each of the segments should be backed (i.e. 4K pages, 2M pages), and some other information about the segments. The naming allows us to monitor the effective memory footprint for each of these segments from the host without looking inside the guest. Sample output: /* Create shared anonymous segmenet */ anon_shmem = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); /* Name the segment: "MY-NAME" */ rv = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, anon_shmem, SIZE, "MY-NAME"); cat /proc//maps (and smaps): 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 [anon_shmem:MY-NAME] If the segment is not named, the output is: 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 /dev/zero (deleted) Link: https://lkml.kernel.org/r/20221115020602.804224-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Hildenbrand Cc: Arnd Bergmann Cc: Bagas Sanjaya Cc: Colin Cross Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Paul Gortmaker Cc: Peter Xu Cc: Sean Christopherson Cc: Vincent Whitchurch Cc: Vlastimil Babka Cc: xu xin Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ include/linux/mm_types.h | 26 ++++++++++++-------------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f873441303b7..686879dbb0bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -700,8 +700,10 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); +bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b0009e7d4ae..157c2e22cc7f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -549,21 +549,11 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. * - * For private anonymous mappings, a pointer to a null terminated string - * containing the name given to the vma, or NULL if unnamed. */ - - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* - * Serialized by mmap_sem. Never use directly because it is - * valid only when vm_file is NULL. Use anon_vma_name instead. - */ - struct anon_vma_name *anon_name; - }; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -584,6 +574,14 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_sem. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif -- cgit From 8d6a0ac09a16c026e1e2a03a61e12e95c48a25a6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:47 +0100 Subject: mm: extend FAULT_FLAG_UNSHARE support to anything in a COW mapping Extend FAULT_FLAG_UNSHARE to break COW on anything mapped into a COW (i.e., private writable) mapping and adjust the documentation accordingly. FAULT_FLAG_UNSHARE will now also break COW when encountering the shared zeropage, a pagecache page, a PFNMAP, ... inside a COW mapping, by properly replacing the mapped page/pfn by a private copy (an exclusive anonymous page). Note that only do_wp_page() needs care: hugetlb_wp() already handles FAULT_FLAG_UNSHARE correctly. wp_huge_pmd()/wp_huge_pud() also handles it correctly, for example, splitting the huge zeropage on FAULT_FLAG_UNSHARE such that we can handle FAULT_FLAG_UNSHARE on the PTE level. This change is a requirement for reliable long-term R/O pinning in COW mappings. Link: https://lkml.kernel.org/r/20221116102659.70287-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 157c2e22cc7f..018b1c098173 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1039,9 +1039,9 @@ typedef struct { * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to unshare (and mark - * exclusive) a possibly shared anonymous page that is - * mapped R/O. + * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a + * COW mapping, making sure that an exclusive anon page is + * mapped after the fault. * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. * We should only access orig_pte if this flag set. * @@ -1066,7 +1066,7 @@ typedef struct { * * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when - * no existing R/O-mapped anonymous page is encountered. + * applied to mappings that are not COW mappings. */ enum fault_flag { FAULT_FLAG_WRITE = 1 << 0, -- cgit From 84209e87c6963f928194a890399e24e8ad299db1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:48 +0100 Subject: mm/gup: reliable R/O long-term pinning in COW mappings We already support reliable R/O pinning of anonymous memory. However, assume we end up pinning (R/O long-term) a pagecache page or the shared zeropage inside a writable private ("COW") mapping. The next write access will trigger a write-fault and replace the pinned page by an exclusive anonymous page in the process page tables to break COW: the pinned page no longer corresponds to the page mapped into the process' page table. Now that FAULT_FLAG_UNSHARE can break COW on anything mapped into a COW mapping, let's properly break COW first before R/O long-term pinning something that's not an exclusive anon page inside a COW mapping. FAULT_FLAG_UNSHARE will break COW and map an exclusive anon page instead that can get pinned safely. With this change, we can stop using FOLL_FORCE|FOLL_WRITE for reliable R/O long-term pinning in COW mappings. With this change, the new R/O long-term pinning tests for non-anonymous memory succeed: # [RUN] R/O longterm GUP pin ... with shared zeropage ok 151 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd ok 152 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with tmpfile ok 153 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with huge zeropage ok 154 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd hugetlb (2048 kB) ok 155 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd hugetlb (1048576 kB) ok 156 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with shared zeropage ok 157 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd ok 158 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with tmpfile ok 159 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with huge zeropage ok 160 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd hugetlb (2048 kB) ok 161 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd hugetlb (1048576 kB) ok 162 Longterm R/O pin is reliable Note 1: We don't care about short-term R/O-pinning, because they have snapshot semantics: they are not supposed to observe modifications that happen after pinning. As one example, assume we start direct I/O to read from a page and store page content into a file: modifications to page content after starting direct I/O are not guaranteed to end up in the file. So even if we'd pin the shared zeropage, the end result would be as expected -- getting zeroes stored to the file. Note 2: For shared mappings we'll now always fallback to the slow path to lookup the VMA when R/O long-term pining. While that's the necessary price we have to pay right now, it's actually not that bad in practice: most FOLL_LONGTERM users already specify FOLL_WRITE, for example, along with FOLL_FORCE because they tried dealing with COW mappings correctly ... Note 3: For users that use FOLL_LONGTERM right now without FOLL_WRITE, such as VFIO, we'd now no longer pin the shared zeropage. Instead, we'd populate exclusive anon pages that we can pin. There was a concern that this could affect the memlock limit of existing setups. For example, a VM running with VFIO could run into the memlock limit and fail to run. However, we essentially had the same behavior already in commit 17839856fd58 ("gup: document and work around "COW can break either way" issue") which got merged into some enterprise distros, and there were not any such complaints. So most probably, we're fine. Link: https://lkml.kernel.org/r/20221116102659.70287-10-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Daniel Vetter Reviewed-by: Vlastimil Babka Reviewed-by: John Hubbard Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 686879dbb0bd..d8363ac34a7c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3149,8 +3149,12 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. + * + * If the vma is NULL, we're coming from the GUP-fast path and might have + * to fallback to the slow path just to lookup the vma. */ -static inline bool gup_must_unshare(unsigned int flags, struct page *page) +static inline bool gup_must_unshare(struct vm_area_struct *vma, + unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry @@ -3163,8 +3167,25 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) * Note: PageAnon(page) is stable until the page is actually getting * freed. */ - if (!PageAnon(page)) - return false; + if (!PageAnon(page)) { + /* + * We only care about R/O long-term pining: R/O short-term + * pinning does not have the semantics to observe successive + * changes through the process page tables. + */ + if (!(flags & FOLL_LONGTERM)) + return false; + + /* We really need the vma ... */ + if (!vma) + return true; + + /* + * ... because we only care about writable private ("COW") + * mappings where we have to break COW early. + */ + return is_cow_mapping(vma->vm_flags); + } /* Paired with a memory barrier in page_try_share_anon_rmap(). */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) -- cgit From 7438899b0b8df16a73d9a5d49b2a345d165adfe8 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:55 -0800 Subject: folio-compat: remove try_to_release_page() There are no more callers of try_to_release_page(), so remove it. This saves 85 bytes of kernel text. Link: https://lkml.kernel.org/r/20221118073055.55694-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b33ab86d5dca..2ec0ca1f3d38 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1105,7 +1105,6 @@ void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch); -int try_to_release_page(struct page *page, gfp_t gfp); bool filemap_release_folio(struct folio *folio, gfp_t gfp); loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, int whence); -- cgit From 8e9d5ead865a1a7af74a444d2f00f1ef4539bfba Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:56 -0800 Subject: mm: add bdi_set_strict_limit() function Patch series "mm/block: add bdi sysfs knobs", v4. At meta network block devices (nbd) are used to implement remote block storage. In testing and during production it has been observed that these network block devices can consume a huge portion of the dirty writeback cache and writeback can take a considerable time. To be able to give stricter limits, I'm proposing the following changes: 1) introduce strictlimit knob Currently the max_ratio knob exists to limit the dirty_memory. However this knob only applies once (dirty_ratio + dirty_background_ratio) / 2 has been reached. With the BDI_CAP_STRICTLIMIT flag, the max_ratio can be applied without reaching that limit. This change exposes that knob. This knob can also be useful for NFS, fuse filesystems and USB devices. 2) Use part of 1000000 internal calculation The max_ratio is based on percentage. With the current machine sizes percentage values can be very high (1% of a 256GB main memory is already 2.5GB). This change uses part of 1000000 instead of percentages for the internal calculations. 3) Introduce two new sysfs knobs: min_bytes and max_bytes. Currently all calculations are based on ratio, but for a user it often more convenient to specify a limit in bytes. The new knobs will not store bytes values, instead they will translate the byte value to a corresponding ratio. As the internal values are now part of 1000, the ratio is closer to the specified value. However the value should be more seen as an approximation as it can fluctuate over time. 3) Introduce two new sysfs knobs: min_ratio_fine and max_ratio_fine. The granularity for the existing sysfs bdi knobs min_ratio and max_ratio is based on percentage values. The new sysfs bdi knobs min_ratio_fine and max_ratio_fine allow to specify the ratio as part of 1 million. This patch (of 20): This adds the bdi_set_strict_limit function to be able to set/unset the BDI_CAP_STRICTLIMIT flag. Link: https://lkml.kernel.org/r/20221119005215.3052436-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20221119005215.3052436-2-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Jens Axboe Cc: Chris Mason Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 439815cc1ab9..9c984ffc8a0a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -104,6 +104,7 @@ static inline unsigned long wb_stat_error(void) int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability -- cgit From ae82291e9ca47c3d6da6b77a00f427754aca413e Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:59 -0800 Subject: mm: use part per 1000000 for bdi ratios To get finer granularity for ratio calculations use part per million instead of percentiles. This is especially important if we want to automatically convert byte values to ratios. Otherwise the values that are actually used can be quite different. This is also important for machines with more main memory (1% of 256GB is already 2.5GB). Link: https://lkml.kernel.org/r/20221119005215.3052436-5-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 9c984ffc8a0a..1b50c028e5ad 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -102,6 +102,9 @@ static inline unsigned long wb_stat_error(void) #endif } +/* BDI ratio is expressed as part per 1000000 for finer granularity. */ +#define BDI_RATIO_SCALE 10000 + int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); -- cgit From 00df7d51263b46ed93f7572e2d09579746f7b1eb Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:00 -0800 Subject: mm: add bdi_get_max_bytes() function This adds a function to return the specified value for max_bytes. It converts the stored max_ratio of the bdi to the corresponding bytes value. It introduces the bdi_get_bytes helper function to do the conversion. This is an approximation as it is based on the value that is returned by global_dirty_limits(), which can change. The helper function will also be used by the min_bytes bdi knob. Link: https://lkml.kernel.org/r/20221119005215.3052436-6-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1b50c028e5ad..473686c32775 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -105,6 +105,7 @@ static inline unsigned long wb_stat_error(void) /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 +u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); -- cgit From 1bf27e98d26d1e62166a456ef17460be085cbe0b Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:02 -0800 Subject: mm: add bdi_set_max_bytes() function This introduces the bdi_set_max_bytes() function. The max_bytes function does not store the max_bytes value. Instead it converts the max_bytes value into the corresponding ratio value. Link: https://lkml.kernel.org/r/20221119005215.3052436-8-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 473686c32775..ea6c993433d5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -108,6 +108,7 @@ static inline unsigned long wb_stat_error(void) u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* -- cgit From 712c00d66a342a3ed375df41c3df7d3d2abad2c0 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:05 -0800 Subject: mm: add bdi_get_min_bytes() function This adds a function to return the specified value for min_bytes. It converts the stored min_ratio of the bdi to the corresponding bytes value. This is an approximation as it is based on the value that is returned by global_dirty_limits(), which can change. The returned value can be different than the value when the min_bytes value was set. Link: https://lkml.kernel.org/r/20221119005215.3052436-11-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ea6c993433d5..8e04567727e6 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -105,6 +105,7 @@ static inline unsigned long wb_stat_error(void) /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 +u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); -- cgit From 803c98050569850be5fd51a2025c67622de887d9 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:07 -0800 Subject: mm: add bdi_set_min_bytes() function This introduces the bdi_set_min_bytes() function. The min_bytes function does not store the min_bytes value. Instead it converts the min_bytes value into the corresponding ratio value. Link: https://lkml.kernel.org/r/20221119005215.3052436-13-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 8e04567727e6..572669758c7f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,6 +109,7 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); -- cgit From 4e230b406eda9bdf7f8a71e2cc3df18a824abcb0 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:10 -0800 Subject: mm: add bdi_set_max_ratio_no_scale() function This introduces bdi_set_max_ratio_no_scale(). It uses the max granularity for the ratio. This function by the new sysfs knob max_ratio_fine. Link: https://lkml.kernel.org/r/20221119005215.3052436-16-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 572669758c7f..d9acbb22ff25 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,6 +109,7 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); -- cgit From 2c44af4f2aaa260199f218f11920c406e688693c Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:13 -0800 Subject: mm: add bdi_set_min_ratio_no_scale() function This introduces bdi_set_min_ratio_no_scale(). It uses the max granularity for the ratio. This function by the new sysfs knob min_ratio_fine. Link: https://lkml.kernel.org/r/20221119005215.3052436-19-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d9acbb22ff25..fbad4fcd408e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,6 +109,7 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); -- cgit From 373dfda2bac1173971099dc76254a016646f62ab Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 24 Nov 2022 18:46:41 +0530 Subject: mm/thp: rename pmd_to_page() as pmd_pgtable_page() Current pmd_to_page(), which derives the page table page containing the pmd address has a very misleading name. The problem being, it sounds similar to pmd_page() which derives page embedded in a given pmd entry either for next level page or a mapped huge page. Rename it as pmd_pgtable_page() instead. Link: https://lkml.kernel.org/r/20221124131641.1523772-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Mike Kravetz Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d8363ac34a7c..2c73dc112ffc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2510,7 +2510,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) #if USE_SPLIT_PMD_PTLOCKS -static struct page *pmd_to_page(pmd_t *pmd) +static struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); @@ -2518,7 +2518,7 @@ static struct page *pmd_to_page(pmd_t *pmd) static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(pmd_to_page(pmd)); + return ptlock_ptr(pmd_pgtable_page(pmd)); } static inline bool pmd_ptlock_init(struct page *page) @@ -2537,7 +2537,7 @@ static inline void pmd_ptlock_free(struct page *page) ptlock_free(page); } -#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte) +#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte) #else -- cgit From 7e25de77bc5ea56cc3ff618fc8f4ea1896a4dbb3 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 25 Nov 2022 09:15:02 +0530 Subject: s390/mm: use pmd_pgtable_page() helper in __gmap_segment_gaddr() In __gmap_segment_gaddr() pmd level page table page is being extracted from the pmd pointer, similar to pmd_pgtable_page() implementation. This reduces some redundancy by directly using pmd_pgtable_page() instead, though first making it available. Link: https://lkml.kernel.org/r/20221125034502.1559986-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Alexander Gordeev Cc: Christian Borntraeger Cc: David Hildenbrand Cc: Heiko Carstens Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c73dc112ffc..8df5cae69c80 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2510,7 +2510,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) #if USE_SPLIT_PMD_PTLOCKS -static struct page *pmd_pgtable_page(pmd_t *pmd) +static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); -- cgit From a351d6087bf7d3d8440d58d3bf244ec64b89394a Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:39 +0800 Subject: bpf, sockmap: Fix missing BPF_F_INGRESS flag when using apply_bytes When redirecting, we use sk_msg_to_ingress() to get the BPF_F_INGRESS flag from the msg->flags. If apply_bytes is used and it is larger than the current data being processed, sk_psock_msg_verdict() will not be called when sendmsg() is called again. At this time, the msg->flags is 0, and we lost the BPF_F_INGRESS flag. So we need to save the BPF_F_INGRESS flag in sk_psock and use it when redirection. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-3-git-send-email-yangpc@wangsu.com --- include/linux/skmsg.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 70d6cb94e580..84f787416a54 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -82,6 +82,7 @@ struct sk_psock { u32 apply_bytes; u32 cork_bytes; u32 eval; + bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) -- cgit From 2e41f274f9aa71cdcc69dc1f26a3f9304a651804 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:16 +0900 Subject: libfs: add DEFINE_SIMPLE_ATTRIBUTE_SIGNED for signed value Patch series "fix error when writing negative value to simple attribute files". The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), but some attribute files want to accept a negative value. This patch (of 3): The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value. This adds DEFINE_SIMPLE_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-1-akinobu.mita@gmail.com Link: https://lkml.kernel.org/r/20220919172418.45257-2-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- include/linux/fs.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e654435f1651..452700c5fa1d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3485,7 +3485,7 @@ void simple_transaction_set(struct file *file, size_t n); * All attributes contain a text representation of a numeric value * that are accessed with the get() and set() functions. */ -#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ +#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ @@ -3496,10 +3496,16 @@ static const struct file_operations __fops = { \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ - .write = simple_attr_write, \ + .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } +#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ + DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) + +#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ + DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) + static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) { @@ -3514,6 +3520,8 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); -- cgit From d472cf797c4e268613dbce5ec9b95d0bcae19ecb Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:18 +0900 Subject: debugfs: fix error when writing negative value to atomic_t debugfs file The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value for a debugfs file created by debugfs_create_atomic_t(). This restores the previous behaviour by introducing DEFINE_DEBUGFS_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-4-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- include/linux/debugfs.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index f60674692d36..ea2d919fd9c7 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -45,7 +45,7 @@ struct debugfs_u32_array { extern struct dentry *arch_debugfs_dir; -#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ +#define DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ @@ -56,10 +56,16 @@ static const struct file_operations __fops = { \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = debugfs_attr_read, \ - .write = debugfs_attr_write, \ + .write = (__is_signed) ? debugfs_attr_write_signed : debugfs_attr_write, \ .llseek = no_llseek, \ } +#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ + DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) + +#define DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ + DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) + typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); #if defined(CONFIG_DEBUG_FS) @@ -102,6 +108,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, const char *new_name); @@ -254,6 +262,13 @@ static inline ssize_t debugfs_attr_write(struct file *file, return -ENODEV; } +static inline ssize_t debugfs_attr_write_signed(struct file *file, + const char __user *buf, + size_t len, loff_t *ppos) +{ + return -ENODEV; +} + static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, char *new_name) { -- cgit From de985c109096b236d43e98da0c65376bf3bc24fb Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 13 Nov 2022 20:08:02 +0900 Subject: linux/init.h: include and With CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y, the following code fails to build: ---------------->8---------------- #include int foo(void) { return 0; } core_initcall(foo); ---------------->8---------------- Include for static_assert() and for __stringify(). Link: https://lkml.kernel.org/r/20221113110802.3760705-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Cc: Jiangshan Yi Cc: Kees Cook Cc: Peter Zijlstra Cc: Randy Dunlap # build-tested Cc: Sami Tolvanen Signed-off-by: Andrew Morton --- include/linux/init.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 2e96756fe1ff..c5fe6d26f5b1 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -2,7 +2,9 @@ #ifndef _LINUX_INIT_H #define _LINUX_INIT_H +#include #include +#include #include /* Built-in __init functions needn't be compiled with retpoline */ -- cgit From 2ff4bed7fee72ba1abfcff5f11ae8f8e570353f2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:29 -0400 Subject: iommufd: File descriptor, context, kconfig and makefiles This is the basic infrastructure of a new miscdevice to hold the iommufd IOCTL API. It provides: - A miscdevice to create file descriptors to run the IOCTL interface over - A table based ioctl dispatch and centralized extendable pre-validation step - An xarray mapping userspace ID's to kernel objects. The design has multiple inter-related objects held within in a single IOMMUFD fd - A simple usage count to build a graph of object relations and protect against hostile userspace racing ioctls The only IOCTL provided in this patch is the generic 'destroy any object by handle' operation. Link: https://lore.kernel.org/r/6-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/linux/iommufd.h (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h new file mode 100644 index 000000000000..d1817472c273 --- /dev/null +++ b/include/linux/iommufd.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Intel Corporation + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __LINUX_IOMMUFD_H +#define __LINUX_IOMMUFD_H + +#include +#include +#include + +struct iommufd_ctx; +struct file; + +void iommufd_ctx_get(struct iommufd_ctx *ictx); + +#if IS_ENABLED(CONFIG_IOMMUFD) +struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); +void iommufd_ctx_put(struct iommufd_ctx *ictx); +#else /* !CONFIG_IOMMUFD */ +static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void iommufd_ctx_put(struct iommufd_ctx *ictx) +{ +} +#endif /* CONFIG_IOMMUFD */ +#endif -- cgit From ce5a23c835aa0f0a931b5bcde1e7811f951b0146 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:30 -0400 Subject: kernel/user: Allow user_struct::locked_vm to be usable for iommufd Following the pattern of io_uring, perf, skb, and bpf, iommfd will use user->locked_vm for accounting pinned pages. Ensure the value is included in the struct and export free_uid() as iommufd is modular. user->locked_vm is the good accounting to use for ulimit because it is per-user, and the security sandboxing of locked pages is not supposed to be per-process. Other places (vfio, vdpa and infiniband) have used mm->pinned_vm and/or mm->locked_vm for accounting pinned pages, but this is only per-process and inconsistent with the new FOLL_LONGTERM users in the kernel. Concurrent work is underway to try to put this in a cgroup, so everything can be consistent and the kernel can provide a FOLL_LONGTERM limit that actually provides security. Link: https://lore.kernel.org/r/7-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/sched/user.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index f054d0360a75..4cc52698e214 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -25,7 +25,7 @@ struct user_struct { #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \ - defined(CONFIG_VFIO_PCI_ZDEV_KVM) + defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD) atomic_long_t locked_vm; #endif #ifdef CONFIG_WATCH_QUEUE -- cgit From f394576eb11dbcd3a740fa41e577b97f0720d26e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:31 -0400 Subject: iommufd: PFN handling for iopt_pages The top of the data structure provides an IO Address Space (IOAS) that is similar to a VFIO container. The IOAS allows map/unmap of memory into ranges of IOVA called iopt_areas. Multiple IOMMU domains (IO page tables) and in-kernel accesses (like VFIO mdevs) can be attached to the IOAS to access the PFNs that those IOVA areas cover. The IO Address Space (IOAS) datastructure is composed of: - struct io_pagetable holding the IOVA map - struct iopt_areas representing populated portions of IOVA - struct iopt_pages representing the storage of PFNs - struct iommu_domain representing each IO page table in the system IOMMU - struct iopt_pages_access representing in-kernel accesses of PFNs (ie VFIO mdevs) - struct xarray pinned_pfns holding a list of pages pinned by in-kernel accesses This patch introduces the lowest part of the datastructure - the movement of PFNs in a tiered storage scheme: 1) iopt_pages::pinned_pfns xarray 2) Multiple iommu_domains 3) The origin of the PFNs, i.e. the userspace pointer PFN have to be copied between all combinations of tiers, depending on the configuration. The interface is an iterator called a 'pfn_reader' which determines which tier each PFN is stored and loads it into a list of PFNs held in a struct pfn_batch. Each step of the iterator will fill up the pfn_batch, then the caller can use the pfn_batch to send the PFNs to the required destination. Repeating this loop will read all the PFNs in an IOVA range. The pfn_reader and pfn_batch also keep track of the pinned page accounting. While PFNs are always stored and accessed as full PAGE_SIZE units the iommu_domain tier can store with a sub-page offset/length to support IOMMUs with a smaller IOPTE size than PAGE_SIZE. Link: https://lore.kernel.org/r/8-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index d1817472c273..26e09d539737 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -13,6 +13,13 @@ struct iommufd_ctx; struct file; +enum { + IOMMUFD_ACCESS_RW_READ = 0, + IOMMUFD_ACCESS_RW_WRITE = 1 << 0, + /* Set if the caller is in a kthread then rw will use kthread_use_mm() */ + IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1, +}; + void iommufd_ctx_get(struct iommufd_ctx *ictx); #if IS_ENABLED(CONFIG_IOMMUFD) -- cgit From e8d57210035b6377d424ba964961892d01127cf6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:36 -0400 Subject: iommufd: Add kAPI toward external drivers for physical devices Add the four functions external drivers need to connect physical DMA to the IOMMUFD: iommufd_device_bind() / iommufd_device_unbind() Register the device with iommufd and establish security isolation. iommufd_device_attach() / iommufd_device_detach() Connect a bound device to a page table Binding a device creates a device object ID in the uAPI, however the generic API does not yet provide any IOCTLs to manipulate them. Link: https://lore.kernel.org/r/13-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 26e09d539737..185dff3eb32f 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -9,10 +9,19 @@ #include #include #include +#include +struct iommufd_device; struct iommufd_ctx; struct file; +struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, + struct device *dev, u32 *id); +void iommufd_device_unbind(struct iommufd_device *idev); + +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); +void iommufd_device_detach(struct iommufd_device *idev); + enum { IOMMUFD_ACCESS_RW_READ = 0, IOMMUFD_ACCESS_RW_WRITE = 1 << 0, -- cgit From 8d40205f6093f18e07fe3dc5920fc85e9f82b8b3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:37 -0400 Subject: iommufd: Add kAPI toward external drivers for kernel access Kernel access is the mode that VFIO "mdevs" use. In this case there is no struct device and no IOMMU connection. iommufd acts as a record keeper for accesses and returns the actual struct pages back to the caller to use however they need. eg with kmap or the DMA API. Each caller must create a struct iommufd_access with iommufd_access_create(), similar to how iommufd_device_bind() works. Using this struct the caller can access blocks of IOVA using iommufd_access_pin_pages() or iommufd_access_rw(). Callers must provide a callback that immediately unpins any IOVA being used within a range. This happens if userspace unmaps the IOVA under the pin. The implementation forwards the access requests directly to the iopt infrastructure that manages the iopt_pages_access. Link: https://lore.kernel.org/r/14-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 185dff3eb32f..46c481a26d79 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -9,10 +9,12 @@ #include #include #include -#include +struct device; struct iommufd_device; +struct page; struct iommufd_ctx; +struct iommufd_access; struct file; struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, @@ -22,6 +24,11 @@ void iommufd_device_unbind(struct iommufd_device *idev); int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); void iommufd_device_detach(struct iommufd_device *idev); +struct iommufd_access_ops { + u8 needs_pin_pages : 1; + void (*unmap)(void *data, unsigned long iova, unsigned long length); +}; + enum { IOMMUFD_ACCESS_RW_READ = 0, IOMMUFD_ACCESS_RW_WRITE = 1 << 0, @@ -29,11 +36,24 @@ enum { IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1, }; +struct iommufd_access * +iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id, + const struct iommufd_access_ops *ops, void *data); +void iommufd_access_destroy(struct iommufd_access *access); + void iommufd_ctx_get(struct iommufd_ctx *ictx); #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); void iommufd_ctx_put(struct iommufd_ctx *ictx); + +int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, + unsigned long length, struct page **out_pages, + unsigned int flags); +void iommufd_access_unpin_pages(struct iommufd_access *access, + unsigned long iova, unsigned long length); +int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, + void *data, size_t len, unsigned int flags); #else /* !CONFIG_IOMMUFD */ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) { @@ -43,5 +63,26 @@ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) static inline void iommufd_ctx_put(struct iommufd_ctx *ictx) { } + +static inline int iommufd_access_pin_pages(struct iommufd_access *access, + unsigned long iova, + unsigned long length, + struct page **out_pages, + unsigned int flags) +{ + return -EOPNOTSUPP; +} + +static inline void iommufd_access_unpin_pages(struct iommufd_access *access, + unsigned long iova, + unsigned long length) +{ +} + +static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, + void *data, size_t len, unsigned int flags) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD */ #endif -- cgit From d624d6652a65ad4f47a58b8651a1ec1163bb81d3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:38 -0400 Subject: iommufd: vfio container FD ioctl compatibility iommufd can directly implement the /dev/vfio/vfio container IOCTLs by mapping them into io_pagetable operations. A userspace application can test against iommufd and confirm compatibility then simply make a small change to open /dev/iommu instead of /dev/vfio/vfio. For testing purposes /dev/vfio/vfio can be symlinked to /dev/iommu and then all applications will use the compatibility path with no code changes. A later series allows /dev/vfio/vfio to be directly provided by iommufd, which allows the rlimit mode to work the same as well. This series just provides the iommufd side of compatibility. Actually linking this to VFIO_SET_CONTAINER is a followup series, with a link in the cover letter. Internally the compatibility API uses a normal IOAS object that, like vfio, is automatically allocated when the first device is attached. Userspace can also query or set this IOAS object directly using the IOMMU_VFIO_IOAS ioctl. This allows mixing and matching new iommufd only features while still using the VFIO style map/unmap ioctls. While this is enough to operate qemu, it has a few differences: - Resource limits rely on memory cgroups to bound what userspace can do instead of the module parameter dma_entry_limit. - VFIO P2P is not implemented. The DMABUF patches for vfio are a start at a solution where iommufd would import a special DMABUF. This is to avoid further propogating the follow_pfn() security problem. - A full audit for pedantic compatibility details (eg errnos, etc) has not yet been done - powerpc SPAPR is left out, as it is not connected to the iommu_domain framework. It seems interest in SPAPR is minimal as it is currently non-working in v6.1-rc1. They will have to convert to the iommu subsystem framework to enjoy iommfd. The following are not going to be implemented and we expect to remove them from VFIO type1: - SW access 'dirty tracking'. As discussed in the cover letter this will be done in VFIO. - VFIO_TYPE1_NESTING_IOMMU https://lore.kernel.org/all/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ - VFIO_DMA_MAP_FLAG_VADDR https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/ Link: https://lore.kernel.org/r/15-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 46c481a26d79..84af9a239769 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -54,6 +54,7 @@ void iommufd_access_unpin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length); int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, void *data, size_t len, unsigned int flags); +int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id); #else /* !CONFIG_IOMMUFD */ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) { @@ -84,5 +85,11 @@ static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long { return -EOPNOTSUPP; } + +static inline int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, + u32 *out_ioas_id) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD */ #endif -- cgit From f4b20bb34c83dceade5470288f48f94ce3598ada Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:39 -0400 Subject: iommufd: Add kernel support for testing iommufd Provide a mock kernel module for the iommu_domain that allows it to run without any HW and the mocking provides a way to directly validate that the PFNs loaded into the iommu_domain are correct. This exposes the access kAPI toward userspace to allow userspace to explore the functionality of pages.c and io_pagetable.c The mock also simulates the rare case of PAGE_SIZE > iommu page size as the mock will operate at a 2K iommu page size. This allows exercising all of the calculations to support this mismatch. This is also intended to support syzkaller exploring the same space. However, it is an unusually invasive config option to enable all of this. The config option should not be enabled in a production kernel. Link: https://lore.kernel.org/r/16-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Matthew Rosato # s390 Tested-by: Eric Auger # aarch64 Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 84af9a239769..650d45629647 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -34,6 +34,9 @@ enum { IOMMUFD_ACCESS_RW_WRITE = 1 << 0, /* Set if the caller is in a kthread then rw will use kthread_use_mm() */ IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1, + + /* Only for use by selftest */ + __IOMMUFD_ACCESS_RW_SLOW_PATH = 1 << 2, }; struct iommufd_access * -- cgit From 3144bfa5078e0df7507a4de72061501e6a0e56be Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 29 Nov 2022 21:21:47 -0800 Subject: bpf: Fix a compilation failure with clang lto build When building the kernel with clang lto (CONFIG_LTO_CLANG_FULL=y), the following compilation error will appear: $ make LLVM=1 LLVM_IAS=1 -j ... ld.lld: error: ld-temp.o :26889:1: symbol 'cgroup_storage_map_btf_ids' is already defined cgroup_storage_map_btf_ids:; ^ make[1]: *** [/.../bpf-next/scripts/Makefile.vmlinux_o:61: vmlinux.o] Error 1 In local_storage.c, we have BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) Commit c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") added the above identical BTF_ID_LIST_SINGLE definition in bpf_cgrp_storage.c. With duplicated definitions, llvm linker complains with lto build. Also, extracting btf_id of 'struct bpf_local_storage_map' is defined four times for sk, inode, task and cgrp local storages. Let us define a single global one with a different name than cgroup_storage_map_btf_ids, which also fixed the lto compilation error. Fixes: c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221130052147.1591625-1-yhs@fb.com --- include/linux/btf_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 93397711a68c..3a4f7cd882ca 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -266,5 +266,6 @@ MAX_BTF_TRACING_TYPE, extern u32 btf_tracing_ids[]; extern u32 bpf_cgroup_btf_id[]; +extern u32 bpf_local_storage_map_btf_id[]; #endif -- cgit From bc66fa87d4fda9053a8145e5718fc278c2b88253 Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Wed, 30 Nov 2022 10:12:16 +0800 Subject: net: phy: Add link between phy dev and mac dev If the external phy used by current mac interface is managed by another mac interface, it means that this network port cannot work independently, especially when the system suspends and resumes, the following trace may appear, so we should create a device link between phy dev and mac dev. WARNING: CPU: 0 PID: 24 at drivers/net/phy/phy.c:983 phy_error+0x20/0x68 Modules linked in: CPU: 0 PID: 24 Comm: kworker/0:2 Not tainted 6.1.0-rc3-00011-g5aaef24b5c6d-dirty #34 Hardware name: Freescale i.MX6 SoloX (Device Tree) Workqueue: events_power_efficient phy_state_machine unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x68/0x90 dump_stack_lvl from __warn+0xb4/0x24c __warn from warn_slowpath_fmt+0x5c/0xd8 warn_slowpath_fmt from phy_error+0x20/0x68 phy_error from phy_state_machine+0x22c/0x23c phy_state_machine from process_one_work+0x288/0x744 process_one_work from worker_thread+0x3c/0x500 worker_thread from kthread+0xf0/0x114 kthread from ret_from_fork+0x14/0x28 Exception stack(0xf0951fb0 to 0xf0951ff8) Signed-off-by: Xiaolei Wang Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20221130021216.1052230-1-xiaolei.wang@windriver.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9a3752c0c444..71eeb4e3b1fd 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -529,6 +529,8 @@ struct macsec_ops; * * @mdio: MDIO bus this PHY is on * @drv: Pointer to the driver for this PHY instance + * @devlink: Create a link between phy dev and mac dev, if the external phy + * used by current mac interface is managed by another mac interface. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. @@ -618,6 +620,8 @@ struct phy_device { /* And management functions */ struct phy_driver *drv; + struct device_link *devlink; + u32 phy_id; struct phy_c45_device_ids c45_ids; -- cgit From d6c494e8ee932b2b21ff4b718eebb378e91b3da0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 30 Nov 2022 12:53:20 +0100 Subject: vdso/timens: Refactor copy-pasted find_timens_vvar_page() helper into one copy find_timens_vvar_page() is not architecture-specific, as can be seen from how all five per-architecture versions of it are the same. (arm64, powerpc and riscv are exactly the same; x86 and s390 have two characters difference inside a comment, less blank lines, and mark the !CONFIG_TIME_NS version as inline.) Refactor the five copies into a central copy in kernel/time/namespace.c. Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20221130115320.2918447-1-jannh@google.com --- include/linux/time_namespace.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 3146f1c056c9..bb9d3f5542f8 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -45,6 +45,7 @@ struct time_namespace *copy_time_ns(unsigned long flags, void free_time_ns(struct time_namespace *ns); void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); struct vdso_data *arch_get_vdso_data(void *vvar_page); +struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { @@ -141,6 +142,11 @@ static inline void timens_on_fork(struct nsproxy *nsproxy, return; } +static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + return NULL; +} + static inline void timens_add_monotonic(struct timespec64 *ts) { } static inline void timens_add_boottime(struct timespec64 *ts) { } -- cgit From 278ab9793116a8531ffaa52b4e61644971131e35 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 30 Nov 2022 13:26:45 -0800 Subject: wifi: ieee80211: Do not open-code qos address offsets When building with -Wstringop-overflow, GCC's KASAN implementation does not correctly perform bounds checking within some complex structures when faced with literal offsets, and can get very confused. For example, this warning is seen due to literal offsets into sturct ieee80211_hdr that may or may not be large enough: drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c: In function 'iwl_mvm_rx_mpdu_mq': drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:2022:29: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] 2022 | *qc &= ~IEEE80211_QOS_CTL_A_MSDU_PRESENT; In file included from drivers/net/wireless/intel/iwlwifi/mvm/fw-api.h:32, from drivers/net/wireless/intel/iwlwifi/mvm/sta.h:15, from drivers/net/wireless/intel/iwlwifi/mvm/mvm.h:27, from drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:10: drivers/net/wireless/intel/iwlwifi/mvm/../fw/api/rx.h:559:16: note: at offset [78, 166] into destination object 'mpdu_len' of size 2 559 | __le16 mpdu_len; | ^~~~~~~~ Refactor ieee80211_get_qos_ctl() to avoid using literal offsets, requiring the creation of the actual structure that is described in the comments. Explicitly choose the desired offset, making the code more human-readable too. This is one of the last remaining warning to fix before enabling -Wstringop-overflow globally. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97490 Link: https://github.com/KSPP/linux/issues/181 Cc: Johannes Berg Cc: Kalle Valo Cc: Gregory Greenman Cc: "Gustavo A. R. Silva" Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221130212641.never.627-kees@kernel.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 6252f02f38b7..80d6308dea06 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -338,6 +338,17 @@ struct ieee80211_qos_hdr { __le16 qos_ctrl; } __packed __aligned(2); +struct ieee80211_qos_hdr_4addr { + __le16 frame_control; + __le16 duration_id; + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; + __le16 seq_ctrl; + u8 addr4[ETH_ALEN]; + __le16 qos_ctrl; +} __packed __aligned(2); + struct ieee80211_trigger { __le16 frame_control; __le16 duration; @@ -4060,16 +4071,21 @@ struct ieee80211_he_6ghz_capa { * @hdr: the frame * * The qos ctrl bytes come after the frame_control, duration, seq_num - * and 3 or 4 addresses of length ETH_ALEN. - * 3 addr: 2 + 2 + 2 + 3*6 = 24 - * 4 addr: 2 + 2 + 2 + 4*6 = 30 + * and 3 or 4 addresses of length ETH_ALEN. Checks frame_control to choose + * between struct ieee80211_qos_hdr_4addr and struct ieee80211_qos_hdr. */ static inline u8 *ieee80211_get_qos_ctl(struct ieee80211_hdr *hdr) { - if (ieee80211_has_a4(hdr->frame_control)) - return (u8 *)hdr + 30; + union { + struct ieee80211_qos_hdr addr3; + struct ieee80211_qos_hdr_4addr addr4; + } *qos; + + qos = (void *)hdr; + if (ieee80211_has_a4(qos->addr3.frame_control)) + return (u8 *)&qos->addr4.qos_ctrl; else - return (u8 *)hdr + 24; + return (u8 *)&qos->addr3.qos_ctrl; } /** -- cgit From e346bebbd36b1576a3335331fed61bb48c6d8823 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 29 Nov 2022 18:23:08 +0100 Subject: efi: libstub: Always enable initrd command line loader and bump version In preparation for setting a cross-architecture baseline for EFI boot support, remove the Kconfig option that permits the command line initrd loader to be disabled. Also, bump the minor version so that any image built with the new version can be identified as supporting this. Acked-by: Leif Lindholm Reviewed-by: Daniel Kiper Signed-off-by: Ard Biesheuvel --- include/linux/pe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pe.h b/include/linux/pe.h index 1d3836ef9d92..056a1762de90 100644 --- a/include/linux/pe.h +++ b/include/linux/pe.h @@ -29,7 +29,7 @@ * handover_offset and xloadflags fields in the bootparams structure. */ #define LINUX_EFISTUB_MAJOR_VERSION 0x1 -#define LINUX_EFISTUB_MINOR_VERSION 0x0 +#define LINUX_EFISTUB_MINOR_VERSION 0x1 #define MZ_MAGIC 0x5a4d /* "MZ" */ -- cgit From e3ea75ee651daf5e434afbfdb7dbf75e200ea1f6 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 4 Oct 2022 15:58:03 +0200 Subject: ext4: journal_path mount options should follow links Before the commit 461c3af045d3 ("ext4: Change handle_mount_opt() to use fs_parameter") ext4 mount option journal_path did follow links in the provided path. Bring this behavior back by allowing to pass pathwalk flags to fs_lookup_param(). Fixes: 461c3af045d3 ("ext4: Change handle_mount_opt() to use fs_parameter") Signed-off-by: Lukas Czerner Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20221004135803.32283-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- include/linux/fs_parser.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index f103c91139d4..01542c4b87a2 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -76,6 +76,7 @@ static inline int fs_parse(struct fs_context *fc, extern int fs_lookup_param(struct fs_context *fc, struct fs_parameter *param, bool want_bdev, + unsigned int flags, struct path *_path); extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); -- cgit From 229d58e31678dece365060c50a39a99a2b1dc729 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:24 +0000 Subject: firmware: arm_ffa: Move constants to header file FF-A function IDs and error codes will be needed in the hypervisor too, so move to them to the header file where they can be shared. Rename the version constants with an "FFA_" prefix so that they are less likely to clash with other code in the tree. Co-developed-by: Andrew Walbran Signed-off-by: Andrew Walbran Signed-off-by: Quentin Perret Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20221116170335.2341003-2-qperret@google.com Signed-off-by: Will Deacon --- include/linux/arm_ffa.h | 83 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 5f02d2e6b9d9..daff44d777fa 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -11,6 +11,89 @@ #include #include +#define FFA_SMC(calling_convention, func_num) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, (calling_convention), \ + ARM_SMCCC_OWNER_STANDARD, (func_num)) + +#define FFA_SMC_32(func_num) FFA_SMC(ARM_SMCCC_SMC_32, (func_num)) +#define FFA_SMC_64(func_num) FFA_SMC(ARM_SMCCC_SMC_64, (func_num)) + +#define FFA_ERROR FFA_SMC_32(0x60) +#define FFA_SUCCESS FFA_SMC_32(0x61) +#define FFA_INTERRUPT FFA_SMC_32(0x62) +#define FFA_VERSION FFA_SMC_32(0x63) +#define FFA_FEATURES FFA_SMC_32(0x64) +#define FFA_RX_RELEASE FFA_SMC_32(0x65) +#define FFA_RXTX_MAP FFA_SMC_32(0x66) +#define FFA_FN64_RXTX_MAP FFA_SMC_64(0x66) +#define FFA_RXTX_UNMAP FFA_SMC_32(0x67) +#define FFA_PARTITION_INFO_GET FFA_SMC_32(0x68) +#define FFA_ID_GET FFA_SMC_32(0x69) +#define FFA_MSG_POLL FFA_SMC_32(0x6A) +#define FFA_MSG_WAIT FFA_SMC_32(0x6B) +#define FFA_YIELD FFA_SMC_32(0x6C) +#define FFA_RUN FFA_SMC_32(0x6D) +#define FFA_MSG_SEND FFA_SMC_32(0x6E) +#define FFA_MSG_SEND_DIRECT_REQ FFA_SMC_32(0x6F) +#define FFA_FN64_MSG_SEND_DIRECT_REQ FFA_SMC_64(0x6F) +#define FFA_MSG_SEND_DIRECT_RESP FFA_SMC_32(0x70) +#define FFA_FN64_MSG_SEND_DIRECT_RESP FFA_SMC_64(0x70) +#define FFA_MEM_DONATE FFA_SMC_32(0x71) +#define FFA_FN64_MEM_DONATE FFA_SMC_64(0x71) +#define FFA_MEM_LEND FFA_SMC_32(0x72) +#define FFA_FN64_MEM_LEND FFA_SMC_64(0x72) +#define FFA_MEM_SHARE FFA_SMC_32(0x73) +#define FFA_FN64_MEM_SHARE FFA_SMC_64(0x73) +#define FFA_MEM_RETRIEVE_REQ FFA_SMC_32(0x74) +#define FFA_FN64_MEM_RETRIEVE_REQ FFA_SMC_64(0x74) +#define FFA_MEM_RETRIEVE_RESP FFA_SMC_32(0x75) +#define FFA_MEM_RELINQUISH FFA_SMC_32(0x76) +#define FFA_MEM_RECLAIM FFA_SMC_32(0x77) +#define FFA_MEM_OP_PAUSE FFA_SMC_32(0x78) +#define FFA_MEM_OP_RESUME FFA_SMC_32(0x79) +#define FFA_MEM_FRAG_RX FFA_SMC_32(0x7A) +#define FFA_MEM_FRAG_TX FFA_SMC_32(0x7B) +#define FFA_NORMAL_WORLD_RESUME FFA_SMC_32(0x7C) + +/* + * For some calls it is necessary to use SMC64 to pass or return 64-bit values. + * For such calls FFA_FN_NATIVE(name) will choose the appropriate + * (native-width) function ID. + */ +#ifdef CONFIG_64BIT +#define FFA_FN_NATIVE(name) FFA_FN64_##name +#else +#define FFA_FN_NATIVE(name) FFA_##name +#endif + +/* FFA error codes. */ +#define FFA_RET_SUCCESS (0) +#define FFA_RET_NOT_SUPPORTED (-1) +#define FFA_RET_INVALID_PARAMETERS (-2) +#define FFA_RET_NO_MEMORY (-3) +#define FFA_RET_BUSY (-4) +#define FFA_RET_INTERRUPTED (-5) +#define FFA_RET_DENIED (-6) +#define FFA_RET_RETRY (-7) +#define FFA_RET_ABORTED (-8) + +/* FFA version encoding */ +#define FFA_MAJOR_VERSION_MASK GENMASK(30, 16) +#define FFA_MINOR_VERSION_MASK GENMASK(15, 0) +#define FFA_MAJOR_VERSION(x) ((u16)(FIELD_GET(FFA_MAJOR_VERSION_MASK, (x)))) +#define FFA_MINOR_VERSION(x) ((u16)(FIELD_GET(FFA_MINOR_VERSION_MASK, (x)))) +#define FFA_PACK_VERSION_INFO(major, minor) \ + (FIELD_PREP(FFA_MAJOR_VERSION_MASK, (major)) | \ + FIELD_PREP(FFA_MINOR_VERSION_MASK, (minor))) +#define FFA_VERSION_1_0 FFA_PACK_VERSION_INFO(1, 0) + +/** + * FF-A specification mentions explicitly about '4K pages'. This should + * not be confused with the kernel PAGE_SIZE, which is the translation + * granule kernel is configured and may be one among 4K, 16K and 64K. + */ +#define FFA_PAGE_SIZE SZ_4K + /* FFA Bus/Device/Driver related */ struct ffa_device { int vm_id; -- cgit From c8e320b00a2a720862b9c028153c681b1a50aa61 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:25 +0000 Subject: firmware: arm_ffa: Move comment before the field it is documenting This is consistent with the other comments in the struct. Co-developed-by: Andrew Walbran Signed-off-by: Andrew Walbran Signed-off-by: Quentin Perret Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20221116170335.2341003-3-qperret@google.com Signed-off-by: Will Deacon --- include/linux/arm_ffa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index daff44d777fa..c87aeecaa9b2 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -244,11 +244,11 @@ struct ffa_mem_region_attributes { */ #define FFA_MEM_RETRIEVE_SELF_BORROWER BIT(0) u8 flag; - u32 composite_off; /* * Offset in bytes from the start of the outer `ffa_memory_region` to * an `struct ffa_mem_region_addr_range`. */ + u32 composite_off; u64 reserved; }; -- cgit From d209ce353a324601f9d3c1eee43b1f0df53021b3 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 1 Dec 2022 20:58:18 +0800 Subject: blk-crypto: Add support for SM4-XTS blk crypto mode SM4 is a symmetric cipher algorithm widely used in China. The SM4-XTS variant is used to encrypt length-preserving data. This is the mandatory algorithm in some special scenarios. Add support for the algorithm to block inline encryption. This is needed for the inlinecrypt mount option to be supported via blk-crypto-fallback, as it is for the other fscrypt modes. Signed-off-by: Tianjia Zhang Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221201125819.36932-2-tianjia.zhang@linux.alibaba.com --- include/linux/blk-crypto.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 69b24fe92cbf..26b1b71c3091 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -13,6 +13,7 @@ enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_AES_256_XTS, BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, BLK_ENCRYPTION_MODE_ADIANTUM, + BLK_ENCRYPTION_MODE_SM4_XTS, BLK_ENCRYPTION_MODE_MAX, }; -- cgit From dcedadfae28562ad04bc351cabfbc0c65b810847 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 30 Nov 2022 12:22:44 -0700 Subject: nvdimm/cxl/pmem: Add support for master passphrase disable security command The original nvdimm_security_ops ->disable() only supports user passphrase for security disable. The CXL spec introduced the disabling of master passphrase. Add a ->disable_master() callback to support this new operation and leaving the old ->disable() mechanism alone. A "disable_master" command is added for the sysfs attribute in order to allow command to be issued from userspace. ndctl will need enabling in order to utilize this new operation. Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/166983616454.2734609.14204031148234398086.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index c74acfa1a3fe..3bf658a74ccb 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -183,6 +183,8 @@ struct nvdimm_security_ops { int (*overwrite)(struct nvdimm *nvdimm, const struct nvdimm_key_data *key_data); int (*query_overwrite)(struct nvdimm *nvdimm); + int (*disable_master)(struct nvdimm *nvdimm, + const struct nvdimm_key_data *key_data); }; enum nvdimm_fwa_state { -- cgit From eb8c507296f6038d46010396d91b42a05c3b64d9 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:55 +0000 Subject: jump_label: Prevent key->enabled int overflow 1. With CONFIG_JUMP_LABEL=n static_key_slow_inc() doesn't have any protection against key->enabled refcounter overflow. 2. With CONFIG_JUMP_LABEL=y static_key_slow_inc_cpuslocked() still may turn the refcounter negative as (v + 1) may overflow. key->enabled is indeed a ref-counter as it's documented in multiple places: top comment in jump_label.h, Documentation/staging/static-keys.rst, etc. As -1 is reserved for static key that's in process of being enabled, functions would break with negative key->enabled refcount: - for CONFIG_JUMP_LABEL=n negative return of static_key_count() breaks static_key_false(), static_key_true() - the ref counter may become 0 from negative side by too many static_key_slow_inc() calls and lead to use-after-free issues. These flaws result in that some users have to introduce an additional mutex and prevent the reference counter from overflowing themselves, see bpf_enable_runtime_stats() checking the counter against INT_MAX / 2. Prevent the reference counter overflow by checking if (v + 1) > 0. Change functions API to return whether the increment was successful. Signed-off-by: Dmitry Safonov Acked-by: Jakub Kicinski Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jakub Kicinski --- include/linux/jump_label.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 570831ca9951..4e968ebadce6 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -224,9 +224,10 @@ extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); -extern void static_key_slow_inc(struct static_key *key); +extern bool static_key_slow_inc(struct static_key *key); +extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); -extern void static_key_slow_inc_cpuslocked(struct static_key *key); +extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); @@ -278,11 +279,23 @@ static __always_inline bool static_key_true(struct static_key *key) return false; } -static inline void static_key_slow_inc(struct static_key *key) +static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { + int v; + STATIC_KEY_CHECK_USE(key); - atomic_inc(&key->enabled); + /* + * Prevent key->enabled getting negative to follow the same semantics + * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment. + */ + v = atomic_read(&key->enabled); + do { + if (v < 0 || (v + 1) < 0) + return false; + } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); + return true; } +#define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { -- cgit From f9231a996e229c13d23f907352c2cea84bd1c30a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 28 Nov 2022 14:15:36 +1000 Subject: module: add module_elf_check_arch for module-specific checks The elf_check_arch() function is also used to test compatibility of usermode binaries. Kernel modules may have more specific requirements, for example powerpc would like to test for ABI version compatibility. Add a weak module_elf_check_arch() that defaults to true, and call it from elf_validity_check(). Signed-off-by: Jessica Yu [np: added changelog, adjust name, rebase] Acked-by: Luis Chamberlain Signed-off-by: Nicholas Piggin Reviewed-by: Joel Stanley Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221128041539.1742489-2-npiggin@gmail.com --- include/linux/moduleloader.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 9e09d11ffe5b..7b4587a19189 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -13,6 +13,9 @@ * must be implemented by each architecture. */ +/* arch may override to do additional checking of ELF header architecture */ +bool module_elf_check_arch(Elf_Ehdr *hdr); + /* Adjust arch-specific sections. Return 0 on success. */ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, -- cgit From 77992f896745c63ae64bfccfdc429ab7b3d88da5 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 23 Nov 2022 12:24:16 +0100 Subject: configfs: remove mentions of committable items A proposition of implementation of committable items has been rejected due to the gpio-sim module being the only user and configfs not getting much development in general. In that case, let's remove the notion of committable items from docs and headers. Signed-off-by: Bartosz Golaszewski Signed-off-by: Christoph Hellwig --- include/linux/configfs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 97cfd13bae51..2606711adb18 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -204,8 +204,6 @@ static struct configfs_bin_attribute _pfx##attr_##_name = { \ * group children. default_groups may coexist alongsize make_group() or * make_item(), but if the group wishes to have only default_groups * children (disallowing mkdir(2)), it need not provide either function. - * If the group has commit(), it supports pending and committed (active) - * items. */ struct configfs_item_operations { void (*release)(struct config_item *); @@ -216,7 +214,6 @@ struct configfs_item_operations { struct configfs_group_operations { struct config_item *(*make_item)(struct config_group *group, const char *name); struct config_group *(*make_group)(struct config_group *group, const char *name); - int (*commit_item)(struct config_item *item); void (*disconnect_notify)(struct config_group *group, struct config_item *item); void (*drop_item)(struct config_group *group, struct config_item *item); }; -- cgit From e634ac4a8aaab37bdc69177df9b40acf92eccc6d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Nov 2022 12:36:31 +0800 Subject: crypto: api - Add crypto_tfm_ctx_dma This patch adds the helpers crypto_tfm_ctx_aligned and crypto_tfm_ctx_dma. The first aligns the tfm context to the value cra_alignmask. The second sets the alignment according to dma_cache_get_alignment(); This patch also moves crypto_tfm_ctx into algapi.h. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 2324ab6f1846..5d1e961f810e 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -714,11 +714,6 @@ static inline void crypto_tfm_clear_flags(struct crypto_tfm *tfm, u32 flags) tfm->crt_flags &= ~flags; } -static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm) -{ - return tfm->__crt_ctx; -} - static inline unsigned int crypto_tfm_ctx_alignment(void) { struct crypto_tfm *tfm; -- cgit From d9a4af5690e26afa8a2eb83c575d3a9ef52cde1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Nov 2022 17:27:14 +0106 Subject: printk: Convert console_drivers list to hlist Replace the open coded single linked list with a hlist so a conversion to SRCU protected list walks can reuse the existing primitives. Co-developed-by: John Ogness Signed-off-by: John Ogness Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-3-john.ogness@linutronix.de --- include/linux/console.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 8c1686e2c233..7b5f21f9e469 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -15,6 +15,7 @@ #define _LINUX_CONSOLE_H_ 1 #include +#include #include struct vc_data; @@ -154,14 +155,16 @@ struct console { u64 seq; unsigned long dropped; void *data; - struct console *next; + struct hlist_node node; }; +extern struct hlist_head console_list; + /* * for_each_console() allows you to iterate on each console */ #define for_each_console(con) \ - for (con = console_drivers; con != NULL; con = con->next) + hlist_for_each_entry(con, &console_list, node) extern int console_set_on_cmdline; extern struct console *early_console; @@ -174,7 +177,6 @@ enum con_flush_mode { extern int add_preferred_console(char *name, int idx, char *options); extern void register_console(struct console *); extern int unregister_console(struct console *); -extern struct console *console_drivers; extern void console_lock(void); extern int console_trylock(void); extern void console_unlock(void); -- cgit From 6c4afa79147e4aa86665795695ac4a5f25e73176 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 16 Nov 2022 17:27:15 +0106 Subject: printk: Prepare for SRCU console list protection Provide an NMI-safe SRCU protected variant to walk the console list. Note that all console fields are now set before adding the console to the list to avoid the console becoming visible by SCRU readers before being fully initialized. This is a preparatory change for a new console infrastructure which operates independent of the console BKL. Suggested-by: Thomas Gleixner Signed-off-by: John Ogness Acked-by: Miguel Ojeda Reviewed-by: Paul E. McKenney Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-4-john.ogness@linutronix.de --- include/linux/console.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 7b5f21f9e469..f4f0c9523835 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -15,7 +15,7 @@ #define _LINUX_CONSOLE_H_ 1 #include -#include +#include #include struct vc_data; @@ -158,8 +158,34 @@ struct console { struct hlist_node node; }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern bool console_srcu_read_lock_is_held(void); +#else +static inline bool console_srcu_read_lock_is_held(void) +{ + return 1; +} +#endif + +extern int console_srcu_read_lock(void); +extern void console_srcu_read_unlock(int cookie); + extern struct hlist_head console_list; +/** + * for_each_console_srcu() - Iterator over registered consoles + * @con: struct console pointer used as loop cursor + * + * Although SRCU guarantees the console list will be consistent, the + * struct console fields may be updated by other CPUs while iterating. + * + * Requires console_srcu_read_lock to be held. Can be invoked from + * any context. + */ +#define for_each_console_srcu(con) \ + hlist_for_each_entry_srcu(con, &console_list, node, \ + console_srcu_read_lock_is_held()) + /* * for_each_console() allows you to iterate on each console */ -- cgit From 4dc64682ad37abb02c54ca598430ac27de60c21a Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 21 Nov 2022 12:16:12 +0106 Subject: printk: introduce console_list_lock Currently there exist races in register_console(), where the types of registered consoles are checked (without holding the console_lock) and then after acquiring the console_lock, it is assumed that the list has not changed. Also, some code that performs console_unregister() make similar assumptions. It might be possible to fix these races using the console_lock. But it would require a complex analysis of all console drivers to make sure that the console_lock is not taken in match() and setup() callbacks. And we really prefer to split up and reduce the responsibilities of console_lock rather than expand its complexity. Therefore, introduce a new console_list_lock to provide full synchronization for any console list changes. In addition, also use console_list_lock for synchronization of console->flags updates. All flags are either static or modified only during the console registration. There are only two exceptions. The first exception is CON_ENABLED, which is also modified by console_start()/console_stop(). Therefore, these functions must also take the console_list_lock. The second exception is when the flags are modified by the console driver init code before the console is registered. These will be ignored because they are not visible to the rest of the system via the console_drivers list. Note that one of the various responsibilities of the console_lock is also intended to provide console list and console->flags synchronization. Later changes will update call sites relying on the console_lock for these purposes. Once all call sites have been updated, the console_lock will be relieved of synchronizing console_list and console->flags updates. Suggested-by: Thomas Gleixner Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/87sficwokr.fsf@jogness.linutronix.de --- include/linux/console.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index f4f0c9523835..24d83e24840b 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -158,6 +158,14 @@ struct console { struct hlist_node node; }; +#ifdef CONFIG_LOCKDEP +extern void lockdep_assert_console_list_lock_held(void); +#else +static inline void lockdep_assert_console_list_lock_held(void) +{ +} +#endif + #ifdef CONFIG_DEBUG_LOCK_ALLOC extern bool console_srcu_read_lock_is_held(void); #else @@ -170,6 +178,9 @@ static inline bool console_srcu_read_lock_is_held(void) extern int console_srcu_read_lock(void); extern void console_srcu_read_unlock(int cookie); +extern void console_list_lock(void) __acquires(console_mutex); +extern void console_list_unlock(void) __releases(console_mutex); + extern struct hlist_head console_list; /** @@ -186,10 +197,16 @@ extern struct hlist_head console_list; hlist_for_each_entry_srcu(con, &console_list, node, \ console_srcu_read_lock_is_held()) -/* - * for_each_console() allows you to iterate on each console +/** + * for_each_console() - Iterator over registered consoles + * @con: struct console pointer used as loop cursor + * + * The console list and the console->flags are immutable while iterating. + * + * Requires console_list_lock to be held. */ -#define for_each_console(con) \ +#define for_each_console(con) \ + lockdep_assert_console_list_lock_held(); \ hlist_for_each_entry(con, &console_list, node) extern int console_set_on_cmdline; -- cgit From 100bdef2c198b8dc64df011dc4a2152db913c8ba Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 16 Nov 2022 17:27:24 +0106 Subject: console: introduce wrappers to read/write console flags After switching to SRCU for console list iteration, some readers will begin readings console->flags as a data race. Locklessly reading console->flags provides a consistent value because there is at most one CPU modifying console->flags and that CPU is using only read-modify-write operations. Introduce a wrapper for SRCU iterators to read console flags. Introduce a matching wrapper to write to flags of registered consoles. Writing to flags of registered consoles is synchronized by the console_list_lock. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-13-john.ogness@linutronix.de --- include/linux/console.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 24d83e24840b..c1ca461d088a 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -183,6 +183,51 @@ extern void console_list_unlock(void) __releases(console_mutex); extern struct hlist_head console_list; +/** + * console_srcu_read_flags - Locklessly read the console flags + * @con: struct console pointer of console to read flags from + * + * This function provides the necessary READ_ONCE() and data_race() + * notation for locklessly reading the console flags. The READ_ONCE() + * in this function matches the WRITE_ONCE() when @flags are modified + * for registered consoles with console_srcu_write_flags(). + * + * Only use this function to read console flags when locklessly + * iterating the console list via srcu. + * + * Context: Any context. + */ +static inline short console_srcu_read_flags(const struct console *con) +{ + WARN_ON_ONCE(!console_srcu_read_lock_is_held()); + + /* + * Locklessly reading console->flags provides a consistent + * read value because there is at most one CPU modifying + * console->flags and that CPU is using only read-modify-write + * operations to do so. + */ + return data_race(READ_ONCE(con->flags)); +} + +/** + * console_srcu_write_flags - Write flags for a registered console + * @con: struct console pointer of console to write flags to + * @flags: new flags value to write + * + * Only use this function to write flags for registered consoles. It + * requires holding the console_list_lock. + * + * Context: Any context. + */ +static inline void console_srcu_write_flags(struct console *con, short flags) +{ + lockdep_assert_console_list_lock_held(); + + /* This matches the READ_ONCE() in console_srcu_read_flags(). */ + WRITE_ONCE(con->flags, flags); +} + /** * for_each_console_srcu() - Iterator over registered consoles * @con: struct console pointer used as loop cursor -- cgit From 1fd4224a6b641f1949e27bf350b5b1c1e47e2ccc Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 16 Nov 2022 17:27:33 +0106 Subject: console: introduce console_is_registered() Currently it is not possible for drivers to detect if they have already successfully registered their console. Several drivers have multiple paths that lead to console registration. To avoid attempting a 2nd registration (which leads to a WARN), drivers are implementing their own solution. Introduce console_is_registered() so drivers can easily identify if their console is currently registered. A _locked() variant is also provided if the caller is already holding the console_list_lock. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-22-john.ogness@linutronix.de --- include/linux/console.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index c1ca461d088a..f716e1dd9eaf 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -228,6 +228,34 @@ static inline void console_srcu_write_flags(struct console *con, short flags) WRITE_ONCE(con->flags, flags); } +/* Variant of console_is_registered() when the console_list_lock is held. */ +static inline bool console_is_registered_locked(const struct console *con) +{ + lockdep_assert_console_list_lock_held(); + return !hlist_unhashed(&con->node); +} + +/* + * console_is_registered - Check if the console is registered + * @con: struct console pointer of console to check + * + * Context: Process context. May sleep while acquiring console list lock. + * Return: true if the console is in the console list, otherwise false. + * + * If false is returned for a console that was previously registered, it + * can be assumed that the console's unregistration is fully completed, + * including the exit() callback after console list removal. + */ +static inline bool console_is_registered(const struct console *con) +{ + bool ret; + + console_list_lock(); + ret = console_is_registered_locked(con); + console_list_unlock(); + return ret; +} + /** * for_each_console_srcu() - Iterator over registered consoles * @con: struct console pointer used as loop cursor -- cgit From 452b9b24754044eced1508f9090611f3d9aa4ca5 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 16 Nov 2022 17:27:34 +0106 Subject: serial_core: replace uart_console_enabled() with uart_console_registered() All users of uart_console_enabled() really want to know if a console is registered. It is not reliable to check for CON_ENABLED in order to identify if a console is registered. Use console_is_registered() instead. A _locked() variant is provided because uart_set_options() is always called with the console_list_lock held and must check if a console is registered in order to synchronize with kgdboc. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-23-john.ogness@linutronix.de --- include/linux/serial_core.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index d657f2a42a7b..91871464b99d 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -743,9 +743,15 @@ static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED; static inline int setup_earlycon(char *buf) { return 0; } #endif -static inline bool uart_console_enabled(struct uart_port *port) +/* Variant of uart_console_registered() when the console_list_lock is held. */ +static inline bool uart_console_registered_locked(struct uart_port *port) { - return uart_console(port) && (port->cons->flags & CON_ENABLED); + return uart_console(port) && console_is_registered_locked(port->cons); +} + +static inline bool uart_console_registered(struct uart_port *port) +{ + return uart_console(port) && console_is_registered(port->cons); } struct uart_port *uart_get_console(struct uart_port *ports, int nr, -- cgit From 6f8836756f3cbb02bfd1f0e033516585250318a9 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 16 Nov 2022 17:27:44 +0106 Subject: printk, xen: fbfront: create/use safe function for forcing preferred With commit 9e124fe16ff2("xen: Enable console tty by default in domU if it's not a dummy") a hack was implemented to make sure that the tty console remains the console behind the /dev/console device. The main problem with the hack is that, after getting the console pointer to the tty console, it is assumed the pointer is still valid after releasing the console_sem. This assumption is incorrect and unsafe. Make the hack safe by introducing a new function console_force_preferred_locked() and perform the full operation under the console_list_lock. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-33-john.ogness@linutronix.de --- include/linux/console.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index f716e1dd9eaf..9cea254b34b8 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -291,6 +291,7 @@ enum con_flush_mode { }; extern int add_preferred_console(char *name, int idx, char *options); +extern void console_force_preferred_locked(struct console *con); extern void register_console(struct console *); extern int unregister_console(struct console *); extern void console_lock(void); -- cgit From dafc7cde23dca239987d3cd000b11cdccc3728ea Mon Sep 17 00:00:00 2001 From: Fabien Parent Date: Tue, 29 Nov 2022 16:57:06 +0100 Subject: regulator: add mt6357 regulator Add regulator driver for the MT6357 PMIC. Signed-off-by: Fabien Parent Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Alexandre Mergnat Link: https://lore.kernel.org/r/20221005-mt6357-support-v7-7-477e60126749@baylibre.com Signed-off-by: Mark Brown --- include/linux/regulator/mt6357-regulator.h | 51 ++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 include/linux/regulator/mt6357-regulator.h (limited to 'include/linux') diff --git a/include/linux/regulator/mt6357-regulator.h b/include/linux/regulator/mt6357-regulator.h new file mode 100644 index 000000000000..238b1ee77ea6 --- /dev/null +++ b/include/linux/regulator/mt6357-regulator.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 MediaTek Inc. + */ + +#ifndef __LINUX_REGULATOR_MT6357_H +#define __LINUX_REGULATOR_MT6357_H + +enum { + /* Bucks */ + MT6357_ID_VCORE, + MT6357_ID_VMODEM, + MT6357_ID_VPA, + MT6357_ID_VPROC, + MT6357_ID_VS1, + + /* LDOs */ + MT6357_ID_VAUX18, + MT6357_ID_VAUD28, + MT6357_ID_VCAMA, + MT6357_ID_VCAMD, + MT6357_ID_VCAMIO, + MT6357_ID_VCN18, + MT6357_ID_VCN28, + MT6357_ID_VCN33_BT, + MT6357_ID_VCN33_WIFI, + MT6357_ID_VDRAM, + MT6357_ID_VEFUSE, + MT6357_ID_VEMC, + MT6357_ID_VFE28, + MT6357_ID_VIBR, + MT6357_ID_VIO18, + MT6357_ID_VIO28, + MT6357_ID_VLDO28, + MT6357_ID_VMC, + MT6357_ID_VMCH, + MT6357_ID_VRF12, + MT6357_ID_VRF18, + MT6357_ID_VSIM1, + MT6357_ID_VSIM2, + MT6357_ID_VSRAM_OTHERS, + MT6357_ID_VSRAM_PROC, + MT6357_ID_VUSB33, + MT6357_ID_VXO22, + + MT6357_ID_RG_MAX, +}; + +#define MT6357_MAX_REGULATOR MT6357_ID_RG_MAX + +#endif /* __LINUX_REGULATOR_MT6357_H */ -- cgit From a4d1f91db5021c57e14721ac090616c90386ac70 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:51 -0400 Subject: vfio-iommufd: Support iommufd for physical VFIO devices This creates the iommufd_device for the physical VFIO drivers. These are all the drivers that are calling vfio_register_group_dev() and expect the type1 code to setup a real iommu_domain against their parent struct device. The design gives the driver a choice in how it gets connected to iommufd by providing bind_iommufd/unbind_iommufd/attach_ioas callbacks to implement as required. The core code provides three default callbacks for physical mode using a real iommu_domain. This is suitable for drivers using vfio_register_group_dev() Link: https://lore.kernel.org/r/6-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- include/linux/vfio.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7cebeb875dd..a7fc4d747dc2 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -17,6 +17,8 @@ #include struct kvm; +struct iommufd_ctx; +struct iommufd_device; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -54,6 +56,10 @@ struct vfio_device { struct completion comp; struct list_head group_next; struct list_head iommu_entry; +#if IS_ENABLED(CONFIG_IOMMUFD) + struct iommufd_device *iommufd_device; + bool iommufd_attached; +#endif }; /** @@ -80,6 +86,10 @@ struct vfio_device_ops { char *name; int (*init)(struct vfio_device *vdev); void (*release)(struct vfio_device *vdev); + int (*bind_iommufd)(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id); + void (*unbind_iommufd)(struct vfio_device *vdev); + int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -96,6 +106,21 @@ struct vfio_device_ops { void __user *arg, size_t argsz); }; +#if IS_ENABLED(CONFIG_IOMMUFD) +int vfio_iommufd_physical_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id); +void vfio_iommufd_physical_unbind(struct vfio_device *vdev); +int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); +#else +#define vfio_iommufd_physical_bind \ + ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ + u32 *out_device_id)) NULL) +#define vfio_iommufd_physical_unbind \ + ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_physical_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) +#endif + /** * @migration_set_state: Optional callback to change the migration state for * devices that support migration. It's mandatory for -- cgit From 4741f2e941298ad7553b65e66624435e14793391 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:52 -0400 Subject: vfio-iommufd: Support iommufd for emulated VFIO devices Emulated VFIO devices are calling vfio_register_emulated_iommu_dev() and consist of all the mdev drivers. Like the physical drivers, support for iommufd is provided by the driver supplying the correct standard ops. Provide ops from the core that duplicate what vfio_register_emulated_iommu_dev() does. Emulated drivers are where it is more likely to see variation in the iommfd support ops. For instance IDXD will probably need to setup both a iommfd_device context linked to a PASID and an iommufd_access context to support all their mdev operations. Link: https://lore.kernel.org/r/7-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- include/linux/vfio.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index a7fc4d747dc2..d5f84f98c0fa 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -19,6 +19,7 @@ struct kvm; struct iommufd_ctx; struct iommufd_device; +struct iommufd_access; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -56,8 +57,10 @@ struct vfio_device { struct completion comp; struct list_head group_next; struct list_head iommu_entry; + struct iommufd_access *iommufd_access; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; + struct iommufd_ctx *iommufd_ictx; bool iommufd_attached; #endif }; @@ -111,6 +114,10 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_physical_unbind(struct vfio_device *vdev); int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); +int vfio_iommufd_emulated_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id); +void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); +int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); #else #define vfio_iommufd_physical_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ @@ -119,6 +126,13 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); ((void (*)(struct vfio_device *vdev)) NULL) #define vfio_iommufd_physical_attach_ioas \ ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) +#define vfio_iommufd_emulated_bind \ + ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ + u32 *out_device_id)) NULL) +#define vfio_iommufd_emulated_unbind \ + ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_emulated_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) #endif /** -- cgit From 30ee198ce42d60101620d33f8bc70c3234798365 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 2 Dec 2022 11:50:10 +0100 Subject: KVM: Reference to kvm_userspace_memory_region in doc and comments There are still references to the removed kvm_memory_region data structure but the doc and comments should mention struct kvm_userspace_memory_region instead, since that is what's used by the ioctl that replaced the old one and this data structure support the same set of flags. Signed-off-by: Javier Martinez Canillas Message-Id: <20221202105011.185147-4-javierm@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8f874a964313..a5a82b536774 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -50,8 +50,8 @@ #endif /* - * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used - * in kvm, other bits are visible for userspace which are defined in + * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally + * used in kvm, other bits are visible for userspace which are defined in * include/linux/kvm_h. */ #define KVM_MEMSLOT_INVALID (1UL << 16) -- cgit From b9069728a70c23dad00684eb994a3f5295f127cf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 1 Dec 2022 23:39:53 -0800 Subject: seccomp: document the "filter_count" field Add missing kernel-doc for the 'filter_count' field in struct seccomp. seccomp.h:40: warning: Function parameter or member 'filter_count' not described in 'seccomp' Fixes: c818c03b661c ("seccomp: Report number of loaded filters in /proc/$pid/status") Signed-off-by: Randy Dunlap Cc: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221202073953.14677-1-rdunlap@infradead.org --- include/linux/seccomp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index d31d76be4982..175079552f68 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -27,6 +27,7 @@ struct seccomp_filter; * * @mode: indicates one of the valid values above for controlled * system calls available to a process. + * @filter_count: number of seccomp filters * @filter: must always point to a valid seccomp-filter or NULL as it is * accessed without locking during system call entry. * -- cgit From f40eb99897af665f11858dd7b56edcb62c3f3c67 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Dec 2022 19:27:58 +0100 Subject: pktcdvd: remove driver. Way back in 2016 in commit 5a8b187c61e9 ("pktcdvd: mark as unmaintained and deprecated") this driver was marked as "will be removed soon". 5 years seems long enough to have it stick around after that, so finally remove the thing now. Reported-by: Christoph Hellwig Cc: Jens Axboe Cc: Thomas Maier Cc: Peter Osterlund Cc: linux-block@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20221202182758.1339039-1-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- include/linux/pktcdvd.h | 197 ------------------------------------------------ 1 file changed, 197 deletions(-) delete mode 100644 include/linux/pktcdvd.h (limited to 'include/linux') diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h deleted file mode 100644 index f9c5ac80d59b..000000000000 --- a/include/linux/pktcdvd.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright (C) 2000 Jens Axboe - * Copyright (C) 2001-2004 Peter Osterlund - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and - * DVD-RW devices. - * - */ -#ifndef __PKTCDVD_H -#define __PKTCDVD_H - -#include -#include -#include -#include -#include -#include -#include - -/* default bio write queue congestion marks */ -#define PKT_WRITE_CONGESTION_ON 10000 -#define PKT_WRITE_CONGESTION_OFF 9000 - - -struct packet_settings -{ - __u32 size; /* packet size in (512 byte) sectors */ - __u8 fp; /* fixed packets */ - __u8 link_loss; /* the rest is specified - * as per Mt Fuji */ - __u8 write_type; - __u8 track_mode; - __u8 block_mode; -}; - -/* - * Very crude stats for now - */ -struct packet_stats -{ - unsigned long pkt_started; - unsigned long pkt_ended; - unsigned long secs_w; - unsigned long secs_rg; - unsigned long secs_r; -}; - -struct packet_cdrw -{ - struct list_head pkt_free_list; - struct list_head pkt_active_list; - spinlock_t active_list_lock; /* Serialize access to pkt_active_list */ - struct task_struct *thread; - atomic_t pending_bios; -}; - -/* - * Switch to high speed reading after reading this many kilobytes - * with no interspersed writes. - */ -#define HI_SPEED_SWITCH 512 - -struct packet_iosched -{ - atomic_t attention; /* Set to non-zero when queue processing is needed */ - int writing; /* Non-zero when writing, zero when reading */ - spinlock_t lock; /* Protecting read/write queue manipulations */ - struct bio_list read_queue; - struct bio_list write_queue; - sector_t last_write; /* The sector where the last write ended */ - int successive_reads; -}; - -/* - * 32 buffers of 2048 bytes - */ -#if (PAGE_SIZE % CD_FRAMESIZE) != 0 -#error "PAGE_SIZE must be a multiple of CD_FRAMESIZE" -#endif -#define PACKET_MAX_SIZE 128 -#define FRAMES_PER_PAGE (PAGE_SIZE / CD_FRAMESIZE) -#define PACKET_MAX_SECTORS (PACKET_MAX_SIZE * CD_FRAMESIZE >> 9) - -enum packet_data_state { - PACKET_IDLE_STATE, /* Not used at the moment */ - PACKET_WAITING_STATE, /* Waiting for more bios to arrive, so */ - /* we don't have to do as much */ - /* data gathering */ - PACKET_READ_WAIT_STATE, /* Waiting for reads to fill in holes */ - PACKET_WRITE_WAIT_STATE, /* Waiting for the write to complete */ - PACKET_RECOVERY_STATE, /* Recover after read/write errors */ - PACKET_FINISHED_STATE, /* After write has finished */ - - PACKET_NUM_STATES /* Number of possible states */ -}; - -/* - * Information needed for writing a single packet - */ -struct pktcdvd_device; - -struct packet_data -{ - struct list_head list; - - spinlock_t lock; /* Lock protecting state transitions and */ - /* orig_bios list */ - - struct bio_list orig_bios; /* Original bios passed to pkt_make_request */ - /* that will be handled by this packet */ - int write_size; /* Total size of all bios in the orig_bios */ - /* list, measured in number of frames */ - - struct bio *w_bio; /* The bio we will send to the real CD */ - /* device once we have all data for the */ - /* packet we are going to write */ - sector_t sector; /* First sector in this packet */ - int frames; /* Number of frames in this packet */ - - enum packet_data_state state; /* Current state */ - atomic_t run_sm; /* Incremented whenever the state */ - /* machine needs to be run */ - long sleep_time; /* Set this to non-zero to make the state */ - /* machine run after this many jiffies. */ - - atomic_t io_wait; /* Number of pending IO operations */ - atomic_t io_errors; /* Number of read/write errors during IO */ - - struct bio *r_bios[PACKET_MAX_SIZE]; /* bios to use during data gathering */ - struct page *pages[PACKET_MAX_SIZE / FRAMES_PER_PAGE]; - - int cache_valid; /* If non-zero, the data for the zone defined */ - /* by the sector variable is completely cached */ - /* in the pages[] vector. */ - - int id; /* ID number for debugging */ - struct pktcdvd_device *pd; -}; - -struct pkt_rb_node { - struct rb_node rb_node; - struct bio *bio; -}; - -struct packet_stacked_data -{ - struct bio *bio; /* Original read request bio */ - struct pktcdvd_device *pd; -}; -#define PSD_POOL_SIZE 64 - -struct pktcdvd_device -{ - struct block_device *bdev; /* dev attached */ - dev_t pkt_dev; /* our dev */ - char name[20]; - struct packet_settings settings; - struct packet_stats stats; - int refcnt; /* Open count */ - int write_speed; /* current write speed, kB/s */ - int read_speed; /* current read speed, kB/s */ - unsigned long offset; /* start offset */ - __u8 mode_offset; /* 0 / 8 */ - __u8 type; - unsigned long flags; - __u16 mmc3_profile; - __u32 nwa; /* next writable address */ - __u32 lra; /* last recorded address */ - struct packet_cdrw cdrw; - wait_queue_head_t wqueue; - - spinlock_t lock; /* Serialize access to bio_queue */ - struct rb_root bio_queue; /* Work queue of bios we need to handle */ - int bio_queue_size; /* Number of nodes in bio_queue */ - bool congested; /* Someone is waiting for bio_queue_size - * to drop. */ - sector_t current_sector; /* Keep track of where the elevator is */ - atomic_t scan_queue; /* Set to non-zero when pkt_handle_queue */ - /* needs to be run. */ - mempool_t rb_pool; /* mempool for pkt_rb_node allocations */ - - struct packet_iosched iosched; - struct gendisk *disk; - - int write_congestion_off; - int write_congestion_on; - - struct device *dev; /* sysfs pktcdvd[0-7] dev */ - - struct dentry *dfs_d_root; /* debugfs: devname directory */ - struct dentry *dfs_f_info; /* debugfs: info file */ -}; - -#endif /* __PKTCDVD_H */ -- cgit From 79cc1ba7badf9e7a12af99695a557e9ce27ee967 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:24 -0800 Subject: panic: Consolidate open-coded panic_on_warn checks Several run-time checkers (KASAN, UBSAN, KFENCE, KCSAN, sched) roll their own warnings, and each check "panic_on_warn". Consolidate this into a single function so that future instrumentation can be added in a single location. Cc: Marco Elver Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Cc: Valentin Schneider Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Vincenzo Frascino Cc: Andrew Morton Cc: David Gow Cc: tangmeng Cc: Jann Horn Cc: Shuah Khan Cc: Petr Mladek Cc: "Paul E. McKenney" Cc: Sebastian Andrzej Siewior Cc: "Guilherme G. Piccoli" Cc: Tiezhu Yang Cc: kasan-dev@googlegroups.com Cc: linux-mm@kvack.org Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Link: https://lore.kernel.org/r/20221117234328.594699-4-keescook@chromium.org --- include/linux/panic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/panic.h b/include/linux/panic.h index c7759b3f2045..979b776e3bcb 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -11,6 +11,7 @@ extern long (*panic_blink)(int state); __printf(1, 2) void panic(const char *fmt, ...) __noreturn __cold; void nmi_panic(struct pt_regs *regs, const char *msg); +void check_panic_on_warn(const char *origin); extern void oops_enter(void); extern void oops_exit(void); extern bool oops_may_print(void); -- cgit From d662198e03bc7fb4635156ee7e8b8d325e2d8512 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 19:42:55 -0800 Subject: hpet: Replace one-element array with flexible-array member One-element arrays are deprecated[1] and are being replaced with flexible array members in support of the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy(), correctly instrument array indexing with UBSAN_BOUNDS, and to globally enable -fstrict-flex-arrays=3. Replace one-element array with flexible-array member in struct hpet. This results in no differences in binary output. The use of struct hpet is never used with sizeof() and accesses via hpet_timers array are already done after explicit bounds checking. [1] https://github.com/KSPP/linux/issues/79 Cc: Clemens Ladisch Cc: "Gustavo A. R. Silva" Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20221118034250.never.999-kees@kernel.org --- include/linux/hpet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hpet.h b/include/linux/hpet.h index 8604564b985d..21e69eaf7a36 100644 --- a/include/linux/hpet.h +++ b/include/linux/hpet.h @@ -30,7 +30,7 @@ struct hpet { unsigned long _hpet_compare; } _u1; u64 hpet_fsb[2]; /* FSB route */ - } hpet_timers[1]; + } hpet_timers[]; }; #define hpet_mc _u0._hpet_mc -- cgit From dc370b28c8425669e7ed5af4c01540645cfb00ec Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 1 Dec 2022 14:03:35 -0800 Subject: nvdimm/region: Move cache management to the region driver Now that cpu_cache_invalidate_memregion() is generically available, use it to centralize CPU cache management in the nvdimm region driver. This trades off removing redundant per-dimm CPU cache flushing with an opportunistic flush on every region disable event to cover the case of sensitive dirty data in the cache being written back to media after a secure erase / overwrite event. Reviewed-by: Davidlohr Bueso Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/166993221550.1995348.16843505129579060258.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 3bf658a74ccb..af38252ad704 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -35,6 +35,11 @@ enum { NDD_WORK_PENDING = 4, /* dimm supports namespace labels */ NDD_LABELING = 6, + /* + * dimm contents have changed requiring invalidation of CPU caches prior + * to activation of a region that includes this device + */ + NDD_INCOHERENT = 7, /* need to set a limit somewhere, but yes, this is likely overkill */ ND_IOCTL_MAX_BUFLEN = SZ_4M, -- cgit From 85d6ce58e493ac8b7122e2fbe3f41b94d6ebdc11 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 3 Dec 2022 15:07:47 +0100 Subject: block: remove devnode callback from struct block_device_operations With the removal of the pktcdvd driver, there are no in-kernel users of the devnode callback in struct block_device_operations, so it can be safely removed. If it is needed for new block drivers in the future, it can be brought back. Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20221203140747.1942969-1-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 469299ea0660..2db2ad72af0f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1395,7 +1395,6 @@ struct block_device_operations { void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); - char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], enum blk_unique_id id_type); -- cgit From 361187e04733eee19778ea9b01cb95a977c14c10 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 30 Nov 2022 15:11:21 -0700 Subject: PCI/AER: Add optional logging callback for correctable error Some new devices such as CXL devices may want to record additional error information on a corrected error. Add a callback to allow the PCI device driver to do additional logging such as providing additional stats for user space RAS monitoring. For CXL device, this is actually a need due to CXL needing to write to the CXL RAS capability structure correctable error status register in order to clear the unmasked correctable errors. See CXL spec rev3.0 8.2.4.16. Suggested-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Acked-by: Bjorn Helgaas Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/166984619233.2804404.3966368388544312674.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dan Williams --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 2bda4a4e47e8..2119a16ecb10 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -843,6 +843,9 @@ struct pci_error_handlers { /* Device driver may resume normal operations */ void (*resume)(struct pci_dev *dev); + + /* Allow device driver to record more details of a correctable error */ + void (*cor_error_detected)(struct pci_dev *dev); }; -- cgit From d93607082e982223cf92750f2d9039ff365b9d24 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 30 Nov 2022 23:28:26 +0100 Subject: net: add netdev_sw_irq_coalesce_default_on() Add a helper for drivers wanting to set SW IRQ coalescing by default. The related sysfs attributes can be used to override the default values. Follow Jakub's suggestion and put this functionality into net core so that drivers wanting to use software interrupt coalescing per default don't have to open-code it. Note that this function needs to be called before the netdevice is registered. Suggested-by: Jakub Kicinski Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..f78db610ada5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -78,6 +78,7 @@ struct xdp_buff; void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); +void netdev_sw_irq_coalesce_default_on(struct net_device *dev); /* Backlog congestion levels */ #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ -- cgit From 3e6743e28b9b43d37ced234bdf8e19955d0216f8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 2 Dec 2022 22:13:36 +0100 Subject: random: include in the right header uses DO_ONCE(), so it should include directly. In contrast, does not use code from , so it should be removed. Move the `#include ` line into the right file. Signed-off-by: Christophe JAILLET Fixes: c0842fbc1b18 ("random32: move the pseudo-random 32-bit definitions to prandom.h") Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 1 + include/linux/random.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index 1f4a0de7b019..c94c02ba065c 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -9,6 +9,7 @@ #define _LINUX_PRANDOM_H #include +#include #include #include diff --git a/include/linux/random.h b/include/linux/random.h index b1a34181eed6..4a2a1de423cd 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -6,7 +6,6 @@ #include #include #include -#include #include -- cgit From fca1aa75518c03b04c3c249e9a9134faf9ca18c5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 10:46:02 -0800 Subject: bpf: Handle MEM_RCU type properly Commit 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") introduced MEM_RCU and bpf_rcu_read_lock/unlock() support. In that commit, a rcu pointer is tagged with both MEM_RCU and PTR_TRUSTED so that it can be passed into kfuncs or helpers as an argument. Martin raised a good question in [1] such that the rcu pointer, although being able to accessing the object, might have reference count of 0. This might cause a problem if the rcu pointer is passed to a kfunc which expects trusted arguments where ref count should be greater than 0. This patch makes the following changes related to MEM_RCU pointer: - MEM_RCU pointer might be NULL (PTR_MAYBE_NULL). - Introduce KF_RCU so MEM_RCU ptr can be acquired with a KF_RCU tagged kfunc which assumes ref count of rcu ptr could be zero. - For mem access 'b = ptr->a', say 'ptr' is a MEM_RCU ptr, and 'a' is tagged with __rcu as well. Let us mark 'b' as MEM_RCU | PTR_MAYBE_NULL. [1] https://lore.kernel.org/bpf/ac70f574-4023-664e-b711-e0d3b18117fd@linux.dev/ Fixes: 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203184602.477272-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- include/linux/btf.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b5090e89cb3f..c07b351a5bc7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -682,7 +682,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { diff --git a/include/linux/btf.h b/include/linux/btf.h index 9ed00077db6e..cbd6e4096f8c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -70,6 +70,7 @@ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +#define KF_RCU (1 << 7) /* kfunc only takes rcu pointer arguments */ /* * Return the name of the passed struct, if exists, or halt the build if for -- cgit From c0c852dd1876dc1db4600ce951a92aadd3073b1c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 12:49:54 -0800 Subject: bpf: Do not mark certain LSM hook arguments as trusted Martin mentioned that the verifier cannot assume arguments from LSM hook sk_alloc_security being trusted since after the hook is called, the sk ref_count is set to 1. This will overwrite the ref_count changed by the bpf program and may cause ref_count underflow later on. I then further checked some other hooks. For example, for bpf_lsm_file_alloc() hook in fs/file_table.c, f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { file_free_rcu(&f->f_rcuhead); return ERR_PTR(error); } atomic_long_set(&f->f_count, 1); The input parameter 'f' to security_file_alloc() cannot be trusted as well. Specifically, I investiaged bpf_map/bpf_prog/file/sk/task alloc/free lsm hooks. Except bpf_map_alloc and task_alloc, arguments for all other hooks should not be considered as trusted. This may not be a complete list, but it covers common usage for sk and task. Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203204954.2043348-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_lsm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 4bcf76a9bb06..1de7ece5d36d 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -28,6 +28,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); bool bpf_lsm_is_sleepable_hook(u32 btf_id); +bool bpf_lsm_is_trusted(const struct bpf_prog *prog); static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) @@ -51,6 +52,11 @@ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) return false; } +static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog) +{ + return false; +} + static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { -- cgit From 25f986887dd54a93edcc5cb499b2e42f4d9c359c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 6 Nov 2022 20:34:16 +0100 Subject: hwmon: Include when appropriate The kstrto() functions have been moved from kernel.h to kstrtox.h. So, include the latter directly in the appropriate files. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/51688cf50bda44e2731381a31287c62319388783.1667763218.git.christophe.jaillet@wanadoo.fr Signed-off-by: Guenter Roeck --- include/linux/hwmon-sysfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hwmon-sysfs.h b/include/linux/hwmon-sysfs.h index cb26d02f52f3..d896713359cd 100644 --- a/include/linux/hwmon-sysfs.h +++ b/include/linux/hwmon-sysfs.h @@ -8,6 +8,7 @@ #define _LINUX_HWMON_SYSFS_H #include +#include struct sensor_device_attribute{ struct device_attribute dev_attr; -- cgit From 59882c7f6714141300882af3d39ca6ffecf54ec2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 20 Nov 2022 10:34:41 +0100 Subject: hwmon: (gsc-hwmon) Switch to flexible array to simplify code Using flexible array is more straight forward. It - saves 1 pointer in the 'gsc_hwmon_platform_data' structure - saves an indirection when using this array - saves some LoC and avoids some always spurious pointer arithmetic Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/61a23e1d642397cfcecc4ac3bb0ab485d257987d.1668936855.git.christophe.jaillet@wanadoo.fr Signed-off-by: Guenter Roeck --- include/linux/platform_data/gsc_hwmon.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/gsc_hwmon.h b/include/linux/platform_data/gsc_hwmon.h index 281f499eda97..f2781aa7eff8 100644 --- a/include/linux/platform_data/gsc_hwmon.h +++ b/include/linux/platform_data/gsc_hwmon.h @@ -29,18 +29,17 @@ struct gsc_hwmon_channel { /** * struct gsc_hwmon_platform_data - platform data for gsc_hwmon driver - * @channels: pointer to array of gsc_hwmon_channel structures - * describing channels * @nchannels: number of elements in @channels array * @vreference: voltage reference (mV) * @resolution: ADC bit resolution * @fan_base: register base for FAN controller + * @channels: array of gsc_hwmon_channel structures describing channels */ struct gsc_hwmon_platform_data { - const struct gsc_hwmon_channel *channels; int nchannels; unsigned int resolution; unsigned int vreference; unsigned int fan_base; + struct gsc_hwmon_channel channels[]; }; #endif -- cgit From 29636a5ce87bebab38c533175d72bb800a7581b8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 9 Nov 2022 15:16:11 +0100 Subject: efi: Put Linux specific magic number in the DOS header GRUB currently relies on the magic number in the image header of ARM and arm64 EFI kernel images to decide whether or not the image in question is a bootable kernel. However, the purpose of the magic number is to identify the image as one that implements the bare metal boot protocol, and so GRUB, which only does EFI boot, is limited unnecessarily to booting images that could potentially be booted in a non-EFI manner as well. This is problematic for the new zboot decompressor image format, as it can only boot in EFI mode, and must therefore not use the bare metal boot magic number in its header. For this reason, the strict magic number was dropped from GRUB, to permit essentially any kind of EFI executable to be booted via the 'linux' command, blurring the line between the linux loader and the chainloader. So let's use the same field in the DOS header that RISC-V and arm64 already use for their 'bare metal' magic numbers to store a 'generic Linux kernel' magic number, which can be used to identify bootable kernel images in PE format which don't necessarily implement a bare metal boot protocol in the same binary. Note that, in the context of EFI, the MS-DOS header is only described in terms of the fields that it shares with the hybrid PE/COFF image format, (i.e., the MS-DOS EXE magic number at offset #0 and the PE header offset at byte offset #0x3c). Since we aim for compatibility with EFI only, and not with MS-DOS or MS-Windows, we can use the remaining space in the MS-DOS header however we want. Let's set the generic magic number for x86 images as well: existing bootloaders already have their own methods to identify x86 Linux images that can be booted in a non-EFI manner, and having the magic number in place there will ease any future transitions in loader implementations to merge the x86 and non-x86 EFI boot paths. Note that 32-bit ARM already uses the same location in the header for a different purpose, but the ARM support is already widely implemented and the EFI zboot decompressor is not available on ARM anyway, so we just disregard it here. Acked-by: Leif Lindholm Reviewed-by: Daniel Kiper Signed-off-by: Ard Biesheuvel --- include/linux/pe.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pe.h b/include/linux/pe.h index 056a1762de90..6ffabf1e6d03 100644 --- a/include/linux/pe.h +++ b/include/linux/pe.h @@ -31,6 +31,13 @@ #define LINUX_EFISTUB_MAJOR_VERSION 0x1 #define LINUX_EFISTUB_MINOR_VERSION 0x1 +/* + * LINUX_PE_MAGIC appears at offset 0x38 into the MS-DOS header of EFI bootable + * Linux kernel images that target the architecture as specified by the PE/COFF + * header machine type field. + */ +#define LINUX_PE_MAGIC 0x818223cd + #define MZ_MAGIC 0x5a4d /* "MZ" */ #define PE_MAGIC 0x00004550 /* "PE\0\0" */ -- cgit From 919e43fad5163a8ceb39826ecdee897a9f799351 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:29 +0200 Subject: xfrm: add an interface to offload policy Extend netlink interface to add and delete XFRM policy from the device. This functionality is a first step to implement packet IPsec offload solution. Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..4096e3fe8e4a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,9 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + int (*xdo_dev_policy_add) (struct xfrm_policy *x); + void (*xdo_dev_policy_delete) (struct xfrm_policy *x); + void (*xdo_dev_policy_free) (struct xfrm_policy *x); }; #endif -- cgit From f3da86dc2c8c9004445cfbb15ac086773622d853 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:33 +0200 Subject: xfrm: add support to HW update soft and hard limits Both in RX and TX, the traffic that performs IPsec packet offload transformation is accounted by HW. It is needed to properly handle hard limits that require to drop the packet. It means that XFRM core needs to update internal counters with the one that accounted by the HW, so new callbacks are introduced in this patch. In case of soft or hard limit is occurred, the driver should call to xfrm_state_check_expire() that will perform key rekeying exactly as done by XFRM core. Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4096e3fe8e4a..29ae964e3b89 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,7 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + void (*xdo_dev_state_update_curlft) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); -- cgit From a46e9010124256f5bf5fc2c241a45cf1944b768e Mon Sep 17 00:00:00 2001 From: Revanth Kumar Uppala Date: Thu, 1 Dec 2022 15:58:43 +0000 Subject: net: stmmac: Power up SERDES after the PHY link The Tegra MGBE ethernet controller requires that the SERDES link is powered-up after the PHY link is up, otherwise the link fails to become ready following a resume from suspend. Add a variable to indicate that the SERDES link must be powered-up after the PHY link. Signed-off-by: Revanth Kumar Uppala Signed-off-by: Jon Hunter Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index fb2e88614f5d..83ca2e8eb6b5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -271,5 +271,6 @@ struct plat_stmmacenet_data { int msi_tx_base_vec; bool use_phy_wol; bool sph_disable; + bool serdes_up_after_phy_linkup; }; #endif -- cgit From 3dad5f9ad99b77dfd234d31e2ea3d77f620efc09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:08 +0100 Subject: genirq/msi: Move IRQ_DOMAIN_MSI_NOMASK_QUIRK to MSI flags It's truly a MSI only flag and for the upcoming per device MSI domains this must be in the MSI flags so it can be set during domain setup without exposing this quirk outside of x86. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230313.454246167@linutronix.de --- include/linux/irqdomain.h | 9 +-------- include/linux/msi.h | 6 ++++++ 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index a4af7f81661b..c42c36947f95 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -186,15 +186,8 @@ enum { /* Irq domain implements MSI remapping */ IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5), - /* - * Quirk to handle MSI implementations which do not provide - * masking. Currently known to affect x86, but partially - * handled in core code. - */ - IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6), - /* Irq domain doesn't translate anything */ - IRQ_DOMAIN_FLAG_NO_MAP = (1 << 7), + IRQ_DOMAIN_FLAG_NO_MAP = (1 << 6), /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved diff --git a/include/linux/msi.h b/include/linux/msi.h index 1ce9d5e0bfa3..2d87e000bd45 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -399,6 +399,12 @@ enum { MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS = (1 << 9), /* Free MSI descriptors */ MSI_FLAG_FREE_MSI_DESCS = (1 << 10), + /* + * Quirk to handle MSI implementations which do not provide + * masking. Currently known to affect x86, but has to be partially + * handled in the core MSI code. + */ + MSI_FLAG_NOMASK_QUIRK = (1 << 11), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, -- cgit From f876ea3b7969329d6472ae4126496bb03c89d89a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:10 +0100 Subject: genirq/irqdomain: Make struct irqdomain readable Tabular alignment of both kernel-doc and the actual struct declaration make visual parsing way more conveniant. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230313.514944367@linutronix.de --- include/linux/irqdomain.h | 74 +++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index c42c36947f95..b1fdd8d309cf 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -117,53 +117,53 @@ struct irq_domain_chip_generic; /** * struct irq_domain - Hardware interrupt number translation object - * @link: Element in global irq_domain list. - * @name: Name of interrupt domain - * @ops: pointer to irq_domain methods - * @host_data: private data pointer for use by owner. Not touched by irq_domain - * core code. - * @flags: host per irq_domain flags - * @mapcount: The number of mapped interrupts + * @link: Element in global irq_domain list. + * @name: Name of interrupt domain + * @ops: Pointer to irq_domain methods + * @host_data: Private data pointer for use by owner. Not touched by irq_domain + * core code. + * @flags: Per irq_domain flags + * @mapcount: The number of mapped interrupts * - * Optional elements - * @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy - * to swap it for the of_node via the irq_domain_get_of_node accessor - * @gc: Pointer to a list of generic chips. There is a helper function for - * setting up one or more generic chips for interrupt controllers - * drivers using the generic chip library which uses this pointer. - * @dev: Pointer to a device that the domain represent, and that will be - * used for power management purposes. - * @parent: Pointer to parent irq_domain to support hierarchy irq_domains + * Optional elements: + * @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy + * to swap it for the of_node via the irq_domain_get_of_node accessor + * @gc: Pointer to a list of generic chips. There is a helper function for + * setting up one or more generic chips for interrupt controllers + * drivers using the generic chip library which uses this pointer. + * @dev: Pointer to a device that can be utilized for power management + * purposes related to the irq domain. + * @parent: Pointer to parent irq_domain to support hierarchy irq_domains * - * Revmap data, used internally by irq_domain - * @revmap_size: Size of the linear map table @revmap[] - * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map - * @revmap_mutex: Lock for the revmap - * @revmap: Linear table of irq_data pointers + * Revmap data, used internally by the irq domain code: + * @revmap_size: Size of the linear map table @revmap[] + * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map + * @revmap_mutex: Lock for the revmap + * @revmap: Linear table of irq_data pointers */ struct irq_domain { - struct list_head link; - const char *name; - const struct irq_domain_ops *ops; - void *host_data; - unsigned int flags; - unsigned int mapcount; + struct list_head link; + const char *name; + const struct irq_domain_ops *ops; + void *host_data; + unsigned int flags; + unsigned int mapcount; /* Optional data */ - struct fwnode_handle *fwnode; - enum irq_domain_bus_token bus_token; - struct irq_domain_chip_generic *gc; - struct device *dev; + struct fwnode_handle *fwnode; + enum irq_domain_bus_token bus_token; + struct irq_domain_chip_generic *gc; + struct device *dev; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - struct irq_domain *parent; + struct irq_domain *parent; #endif /* reverse map data. The linear map gets appended to the irq_domain */ - irq_hw_number_t hwirq_max; - unsigned int revmap_size; - struct radix_tree_root revmap_tree; - struct mutex revmap_mutex; - struct irq_data __rcu *revmap[]; + irq_hw_number_t hwirq_max; + unsigned int revmap_size; + struct radix_tree_root revmap_tree; + struct mutex revmap_mutex; + struct irq_data __rcu *revmap[]; }; /* Irq domain flags */ -- cgit From 6a9fc4190ca23fcf327de666e1a98dbdcdd2d210 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:12 +0100 Subject: genirq/irqdomain: Rename irq_domain::dev to irq_domain:: Pm_dev irq_domain::dev is a misnomer as it's usually the rule that a device pointer points to something which is directly related to the instance. irq_domain::dev can point to some other device for power management to ensure that this underlying device is not powered down when an interrupt is allocated. The upcoming per device MSI domains really require a pointer to the device which instantiated the irq domain and not to some random other device which is required for power management down the chain. Rename irq_domain::dev to irq_domain::pm_dev and fixup the few sites which use that pointer. Conversion was done with the help of coccinelle. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230313.574541683@linutronix.de --- include/linux/irqdomain.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index b1fdd8d309cf..aa76da82fa36 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -131,7 +131,7 @@ struct irq_domain_chip_generic; * @gc: Pointer to a list of generic chips. There is a helper function for * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. - * @dev: Pointer to a device that can be utilized for power management + * @pm_dev: Pointer to a device that can be utilized for power management * purposes related to the irq domain. * @parent: Pointer to parent irq_domain to support hierarchy irq_domains * @@ -153,7 +153,7 @@ struct irq_domain { struct fwnode_handle *fwnode; enum irq_domain_bus_token bus_token; struct irq_domain_chip_generic *gc; - struct device *dev; + struct device *pm_dev; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_domain *parent; #endif @@ -206,7 +206,7 @@ static inline void irq_domain_set_pm_device(struct irq_domain *d, struct device *dev) { if (d) - d->dev = dev; + d->pm_dev = dev; } #ifdef CONFIG_IRQ_DOMAIN -- cgit From 6b6941f6653ec8017e3822b55bb9eae245f0ed99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:14 +0100 Subject: genirq/msi: Create msi_api.h Create a API header for MSI specific functions which are relevant to device drivers. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230313.632679220@linutronix.de --- include/linux/msi.h | 6 ++++-- include/linux/msi_api.h | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 include/linux/msi_api.h (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 2d87e000bd45..9f72494dcc3e 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -13,11 +13,14 @@ * * Regular device drivers have no business with any of these functions and * especially storing MSI descriptor pointers in random code is considered - * abuse. The only function which is relevant for drivers is msi_get_virq(). + * abuse. + * + * Device driver relevant functions are available in */ #include #include +#include #include #include #include @@ -188,7 +191,6 @@ struct msi_device_data { int msi_setup_device_data(struct device *dev); -unsigned int msi_get_virq(struct device *dev, unsigned int index); void msi_lock_descs(struct device *dev); void msi_unlock_descs(struct device *dev); diff --git a/include/linux/msi_api.h b/include/linux/msi_api.h new file mode 100644 index 000000000000..57d27cfcde2d --- /dev/null +++ b/include/linux/msi_api.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_MSI_API_H +#define LINUX_MSI_API_H + +/* + * APIs which are relevant for device driver code for allocating and + * freeing MSI interrupts and querying the associations between + * hardware/software MSI indices and the Linux interrupt number. + */ + +struct device; + +unsigned int msi_get_virq(struct device *dev, unsigned int index); + +#endif -- cgit From b749e6d31c88cff2202b968aaba246e5d7379038 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:15 +0100 Subject: genirq/irqdomain: Provide IRQ_DOMAIN_FLAG_MSI_PARENT The new PCI/IMS (Interrupt Message Store) functionality is allowing hardware vendors to provide implementation specific storage for the MSI messages. This can be device memory and also host/guest memory, e.g. in queue memory which is shared with the hardware. This requires device specific MSI interrupt domains, which cannot be achieved by expanding the existing PCI/MSI interrupt domain concept which is a global interrupt domain shared by all PCI devices on a particular (IOMMU) segment: |--- device 1 [Vector]---[Remapping]---[PCI/MSI]--|... |--- device N This works because the PCI/MSI[-X] space is uniform, but falls apart with PCI/IMS which is implementation defined and must be available along with PCI/MSI[-X] on the same device. To support PCI/MSI[-X] plus PCI/IMS on the same device it is required to rework the PCI/MSI interrupt domain hierarchy concept in the following way: |--- [PCI/MSI] device 1 [Vector]---[Remapping]---|... |--- [PCI/MSI] device N That allows in the next step to create multiple interrupt domains per device: |--- [PCI/MSI] device 1 |--- [PCI/IMS] device 1 [Vector]---[Remapping]---|... |--- [PCI/MSI] device N |--- [PCI/IMS] device N So the domain which previously created the global PCI/MSI domain must now act as parent domain for the per device domains. The hierarchy depth is the same as before, but the PCI/MSI domains are then device specific and not longer global. Provide IRQ_DOMAIN_FLAG_MSI_PARENT, which allows to identify these parent domains, along with helpers to query it. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230313.690038274@linutronix.de --- include/linux/irqdomain.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index aa76da82fa36..f837db9a5339 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -189,6 +189,9 @@ enum { /* Irq domain doesn't translate anything */ IRQ_DOMAIN_FLAG_NO_MAP = (1 << 6), + /* Irq domain is a MSI parent domain */ + IRQ_DOMAIN_FLAG_MSI_PARENT = (1 << 8), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -551,6 +554,11 @@ static inline bool irq_domain_is_msi_remap(struct irq_domain *domain) extern bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain); +static inline bool irq_domain_is_msi_parent(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_MSI_PARENT; +} + #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) @@ -596,6 +604,12 @@ irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain) { return false; } + +static inline bool irq_domain_is_msi_parent(struct irq_domain *domain) +{ + return false; +} + #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #else /* CONFIG_IRQ_DOMAIN */ -- cgit From e71c5d0bb1885b73c746b634895b85d135ce2c87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:17 +0100 Subject: genirq/irqdomain: Provide IRQ_DOMAIN_FLAG_MSI_DEVICE Similar to marking parent MSI domains it's required to identify per device domains. Add flag and helpers. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230313.747627287@linutronix.de --- include/linux/irqdomain.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f837db9a5339..24b76688c2fb 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -192,6 +192,9 @@ enum { /* Irq domain is a MSI parent domain */ IRQ_DOMAIN_FLAG_MSI_PARENT = (1 << 8), + /* Irq domain is a MSI device domain */ + IRQ_DOMAIN_FLAG_MSI_DEVICE = (1 << 9), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -559,6 +562,11 @@ static inline bool irq_domain_is_msi_parent(struct irq_domain *domain) return domain->flags & IRQ_DOMAIN_FLAG_MSI_PARENT; } +static inline bool irq_domain_is_msi_device(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_MSI_DEVICE; +} + #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) @@ -610,6 +618,11 @@ static inline bool irq_domain_is_msi_parent(struct irq_domain *domain) return false; } +static inline bool irq_domain_is_msi_device(struct irq_domain *domain) +{ + return false; +} + #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #else /* CONFIG_IRQ_DOMAIN */ -- cgit From f1139f905bd2d8bec59be0c1404b592279ae0ac9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:20 +0100 Subject: genirq/msi: Move xarray into a separate struct and create an array The upcoming support for multiple MSI domains per device requires storage for the MSI descriptors and in a second step storage for the irqdomain pointers. Move the xarray into a separate data structure msi_dev_domain and create an array with size 1 in msi_device_data, which can be expanded later when the support for per device domains is implemented. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230313.864887773@linutronix.de --- include/linux/msi.h | 13 +++++++++++-- include/linux/msi_api.h | 8 ++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 9f72494dcc3e..f7b9c41a4f54 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -173,19 +173,28 @@ enum msi_desc_filter { MSI_DESC_ASSOCIATED, }; + +/** + * struct msi_dev_domain - The internals of MSI domain info per device + * @store: Xarray for storing MSI descriptor pointers + */ +struct msi_dev_domain { + struct xarray store; +}; + /** * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers * @platform_data: Platform-MSI specific data * @mutex: Mutex protecting the MSI descriptor store - * @__store: Xarray for storing MSI descriptor pointers + * @__domains: Internal data for per device MSI domains * @__iter_idx: Index to search the next entry for iterators */ struct msi_device_data { unsigned long properties; struct platform_msi_priv_data *platform_data; struct mutex mutex; - struct xarray __store; + struct msi_dev_domain __domains[MSI_MAX_DEVICE_IRQDOMAINS]; unsigned long __iter_idx; }; diff --git a/include/linux/msi_api.h b/include/linux/msi_api.h index 57d27cfcde2d..4dbbce68a217 100644 --- a/include/linux/msi_api.h +++ b/include/linux/msi_api.h @@ -10,6 +10,14 @@ struct device; +/* + * Per device interrupt domain related constants. + */ +enum msi_domain_ids { + MSI_DEFAULT_DOMAIN, + MSI_MAX_DEVICE_IRQDOMAINS, +}; + unsigned int msi_get_virq(struct device *dev, unsigned int index); #endif -- cgit From 64258eaa442b0b7d7eac8942cf27863ad9e6028e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:22 +0100 Subject: genirq/msi: Add pointers for per device irq domains With the upcoming per device MSI interrupt domain support it is necessary to store the domain pointers per device. Instead of delegating that storage to device drivers or subsystems add a domain pointer to the msi_dev_domain array in struct msi_device_data. This pointer is also used to take care of tearing down the irq domains when msi_device_data is cleaned up via devres. The interfaces into the MSI core will be changed from irqdomain pointer based interfaces to domain id based interfaces to support multiple MSI domains on a single device (e.g. PCI/MSI[-X] and PCI/IMS. Once the per device domain support is complete the irq domain pointer in struct device::msi.domain will not longer contain a pointer to the "global" MSI domain. It will contain a pointer to the MSI parent domain instead. It would be a horrible maze of conditionals to evaluate all over the place which domain pointer should be used, i.e. the "global" one in device::msi::domain or one from the internal pointer array. To avoid this evaluate in msi_setup_device_data() whether the irq domain which is associated to a device is a "global" or a parent MSI domain. If it is global then copy the pointer into the first entry of the msi_dev_domain array. This allows to convert interfaces and implementation to domain ids while keeping everything existing working. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230313.923860399@linutronix.de --- include/linux/msi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index f7b9c41a4f54..fdbc80d81e26 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -77,6 +77,7 @@ struct msi_desc; struct pci_dev; struct platform_msi_priv_data; struct device_attribute; +struct irq_domain; void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ @@ -177,9 +178,11 @@ enum msi_desc_filter { /** * struct msi_dev_domain - The internals of MSI domain info per device * @store: Xarray for storing MSI descriptor pointers + * @irqdomain: Pointer to a per device interrupt domain */ struct msi_dev_domain { struct xarray store; + struct irq_domain *domain; }; /** -- cgit From 94ff94cfea2827e287a42781a47f37c9ef82186b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:24 +0100 Subject: genirq/msi: Make MSI descriptor iterators device domain aware To support multiple MSI interrupt domains per device it is necessary to segment the xarray MSI descriptor storage. Each domain gets up to MSI_MAX_INDEX entries. Change the iterators so they operate with domain ids and take the domain offsets into account. The publicly available iterators which are mostly used in legacy implementations and the PCI/MSI core default to MSI_DEFAULT_DOMAIN (0) which is the id for the existing "global" domains. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230313.985498981@linutronix.de --- include/linux/msi.h | 48 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index fdbc80d81e26..764a4e8828fb 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -206,11 +206,48 @@ int msi_setup_device_data(struct device *dev); void msi_lock_descs(struct device *dev); void msi_unlock_descs(struct device *dev); -struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter); -struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter); +struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, + enum msi_desc_filter filter); /** - * msi_for_each_desc - Iterate the MSI descriptors + * msi_first_desc - Get the first MSI descriptor of the default irqdomain + * @dev: Device to operate on + * @filter: Descriptor state filter + * + * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs() + * must be invoked before the call. + * + * Return: Pointer to the first MSI descriptor matching the search + * criteria, NULL if none found. + */ +static inline struct msi_desc *msi_first_desc(struct device *dev, + enum msi_desc_filter filter) +{ + return msi_domain_first_desc(dev, MSI_DEFAULT_DOMAIN, filter); +} + +struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid, + enum msi_desc_filter filter); + +/** + * msi_domain_for_each_desc - Iterate the MSI descriptors in a specific domain + * + * @desc: struct msi_desc pointer used as iterator + * @dev: struct device pointer - device to iterate + * @domid: The id of the interrupt domain which should be walked. + * @filter: Filter for descriptor selection + * + * Notes: + * - The loop must be protected with a msi_lock_descs()/msi_unlock_descs() + * pair. + * - It is safe to remove a retrieved MSI descriptor in the loop. + */ +#define msi_domain_for_each_desc(desc, dev, domid, filter) \ + for ((desc) = msi_domain_first_desc((dev), (domid), (filter)); (desc); \ + (desc) = msi_next_desc((dev), (domid), (filter))) + +/** + * msi_for_each_desc - Iterate the MSI descriptors in the default irqdomain * * @desc: struct msi_desc pointer used as iterator * @dev: struct device pointer - device to iterate @@ -221,9 +258,8 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter); * pair. * - It is safe to remove a retrieved MSI descriptor in the loop. */ -#define msi_for_each_desc(desc, dev, filter) \ - for ((desc) = msi_first_desc((dev), (filter)); (desc); \ - (desc) = msi_next_desc((dev), (filter))) +#define msi_for_each_desc(desc, dev, filter) \ + msi_domain_for_each_desc((desc), (dev), MSI_DEFAULT_DOMAIN, (filter)) #define msi_desc_to_dev(desc) ((desc)->dev) -- cgit From 98043704f375f63a47efeff123ab92fcf34b95e6 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Fri, 25 Nov 2022 00:24:25 +0100 Subject: genirq/msi: Make msi_get_virq() device domain aware In preparation of the upcoming per device multi MSI domain support, change the interface to support lookups based on domain id and zero based index within the domain. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230314.044613697@linutronix.de --- include/linux/msi_api.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi_api.h b/include/linux/msi_api.h index 4dbbce68a217..8640171288a8 100644 --- a/include/linux/msi_api.h +++ b/include/linux/msi_api.h @@ -18,6 +18,18 @@ enum msi_domain_ids { MSI_MAX_DEVICE_IRQDOMAINS, }; -unsigned int msi_get_virq(struct device *dev, unsigned int index); +unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index); + +/** + * msi_get_virq - Lookup the Linux interrupt number for a MSI index on the default interrupt domain + * @dev: Device for which the lookup happens + * @index: The MSI index to lookup + * + * Return: The Linux interrupt number on success (> 0), 0 if not found + */ +static inline unsigned int msi_get_virq(struct device *dev, unsigned int index) +{ + return msi_domain_get_virq(dev, MSI_DEFAULT_DOMAIN, index); +} #endif -- cgit From 1c89396300292da4e4a87db63bef45975461fa25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:27 +0100 Subject: genirq/msi: Rename msi_add_msi_desc() to msi_insert_msi_desc() This reflects the functionality better. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230314.103554618@linutronix.de --- include/linux/msi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 764a4e8828fb..f8ba85af3542 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -286,7 +286,7 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, } #endif -int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc); +int msi_insert_msi_desc(struct device *dev, struct msi_desc *init_desc); void msi_free_msi_descs_range(struct device *dev, unsigned int first_index, unsigned int last_index); /** -- cgit From fc8ab388325ddfd848d00f3a3383b4304594546a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:29 +0100 Subject: genirq/msi: Make descriptor allocation device domain aware Change the descriptor allocation and insertion functions to take a domain id to prepare for the upcoming multi MSI domain per device support. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230314.163043028@linutronix.de --- include/linux/msi.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index f8ba85af3542..35e9d0050ca1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -286,7 +286,21 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, } #endif -int msi_insert_msi_desc(struct device *dev, struct msi_desc *init_desc); +int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, + struct msi_desc *init_desc); +/** + * msi_insert_msi_desc - Allocate and initialize a MSI descriptor in the + * default irqdomain and insert it at @init_desc->msi_index + * @dev: Pointer to the device for which the descriptor is allocated + * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor + * + * Return: 0 on success or an appropriate failure code. + */ +static inline int msi_insert_msi_desc(struct device *dev, struct msi_desc *init_desc) +{ + return msi_domain_insert_msi_desc(dev, MSI_DEFAULT_DOMAIN, init_desc); +} + void msi_free_msi_descs_range(struct device *dev, unsigned int first_index, unsigned int last_index); /** -- cgit From 377712c5a45f6de40bffef5ddc31b39cd86cf23f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:30 +0100 Subject: genirq/msi: Make descriptor freeing domain aware Change the descriptor free functions to take a domain id to prepare for the upcoming multi MSI domain per device support. To avoid changing and extending the interfaces over and over use an core internal control struct and hand the pointer through the various functions. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230314.220788011@linutronix.de --- include/linux/msi.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 35e9d0050ca1..23172d6354ca 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -301,10 +301,25 @@ static inline int msi_insert_msi_desc(struct device *dev, struct msi_desc *init_ return msi_domain_insert_msi_desc(dev, MSI_DEFAULT_DOMAIN, init_desc); } -void msi_free_msi_descs_range(struct device *dev, unsigned int first_index, unsigned int last_index); +void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last); /** - * msi_free_msi_descs - Free MSI descriptors of a device + * msi_free_msi_descs_range - Free a range of MSI descriptors of a device + * in the default irqdomain + * + * @dev: Device for which to free the descriptors + * @first: Index to start freeing from (inclusive) + * @last: Last index to be freed (inclusive) + */ +static inline void msi_free_msi_descs_range(struct device *dev, unsigned int first, + unsigned int last) +{ + msi_domain_free_msi_descs_range(dev, MSI_DEFAULT_DOMAIN, first, last); +} + +/** + * msi_free_msi_descs - Free all MSI descriptors of a device in the default irqdomain * @dev: Device to free the descriptors */ static inline void msi_free_msi_descs(struct device *dev) -- cgit From 4cd5f4403f283766f73749c4c936801f58ffe77a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:33 +0100 Subject: genirq/msi: Provide new domain id based interfaces for freeing interrupts Provide two sorts of interfaces to handle the different use cases: - msi_domain_free_irqs_range(): Handles a caller defined precise range - msi_domain_free_irqs_all(): Frees all interrupts associated to a domain The latter is useful for device teardown and to handle the legacy MSI support which does not have any range information available. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230314.337844751@linutronix.de --- include/linux/msi.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 23172d6354ca..74cb0a93de6f 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -498,6 +498,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev); void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); + +void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last); +void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last); + +void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid); +void msi_domain_free_irqs_all(struct device *dev, unsigned int domid); + struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode, -- cgit From f2480e7dacdcd9aab25641346ae53b7ff03777fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:35 +0100 Subject: genirq/msi: Provide new domain id allocation functions Provide two sorts of interfaces to handle the different use cases: - msi_domain_alloc_irqs_range(): Handles a caller defined precise range - msi_domain_alloc_irqs_all(): Allocates all interrupts associated to a domain by scanning the allocated MSI descriptors The latter is useful for the existing PCI/MSI support which does not have range information available. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230314.396497163@linutronix.de --- include/linux/msi.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 74cb0a93de6f..611707d619fc 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -383,8 +383,8 @@ struct msi_domain_info; * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying * irqdomain. * - * @msi_check, @msi_prepare and @set_desc are callbacks used by - * msi_domain_alloc/free_irqs(). + * @msi_check, @msi_prepare and @set_desc are callbacks used by the + * msi_domain_alloc/free_irqs*() variants. * * @domain_alloc_irqs, @domain_free_irqs can be used to override the * default allocation/free functions (__msi_domain_alloc/free_irqs). This @@ -392,11 +392,6 @@ struct msi_domain_info; * be wrapped into the regular irq domains concepts by mere mortals. This * allows to universally use msi_domain_alloc/free_irqs without having to * special case XEN all over the place. - * - * Contrary to other operations @domain_alloc_irqs and @domain_free_irqs - * are set to the default implementation if NULL and even when - * MSI_FLAG_USE_DEF_DOM_OPS is not set to avoid breaking existing users and - * because these callbacks are obviously mandatory. */ struct msi_domain_ops { irq_hw_number_t (*get_hwirq)(struct msi_domain_info *info, @@ -496,14 +491,21 @@ int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device int nvec); int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); + void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev); void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); +int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last); +int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last); +int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs); + + void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); - void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid); void msi_domain_free_irqs_all(struct device *dev, unsigned int domid); -- cgit From c459f11f32a022d0f97694030419d16816275a9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:24:43 +0100 Subject: genirq/msi: Remove unused alloc/free interfaces Now that all users are converted remove the old interfaces. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124230314.694291814@linutronix.de --- include/linux/msi.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 611707d619fc..43b8866c8431 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -487,13 +487,6 @@ int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); -int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev, - int nvec); -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec); - -void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev); -void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); -- cgit From 2d958b02b04f18955b0e15eda531461153c399d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:25:46 +0100 Subject: genirq/msi: Rearrange MSI domain flags These flags got added as necessary and have no obvious structure. For feature support checks and masking it's convenient to have two blocks of flags: 1) Flags to control the internal behaviour like allocating/freeing MSI descriptors. Those flags do not need any support from the underlying MSI parent domain. They are mostly under the control of the outermost domain which implements the actual MSI support. 2) Flags to expose features, e.g. PCI multi-MSI or requirements which can depend on a underlying domain. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232325.322714918@linutronix.de --- include/linux/msi.h | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 43b8866c8431..a4339ebcc035 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -24,6 +24,8 @@ #include #include #include +#include + #include /* Dummy shadow structures if an architecture does not define them */ @@ -440,7 +442,16 @@ struct msi_domain_info { void *data; }; -/* Flags for msi_domain_info */ +/* + * Flags for msi_domain_info + * + * Bit 0-15: Generic MSI functionality which is not subject to restriction + * by parent domains + * + * Bit 16-31: Functionality which depends on the underlying parent domain and + * can be masked out by msi_parent_ops::init_dev_msi_info() when + * a device MSI domain is initialized. + */ enum { /* * Init non implemented ops callbacks with default MSI domain @@ -452,33 +463,41 @@ enum { * callbacks. */ MSI_FLAG_USE_DEF_CHIP_OPS = (1 << 1), - /* Support multiple PCI MSI interrupts */ - MSI_FLAG_MULTI_PCI_MSI = (1 << 2), - /* Support PCI MSIX interrupts */ - MSI_FLAG_PCI_MSIX = (1 << 3), /* Needs early activate, required for PCI */ - MSI_FLAG_ACTIVATE_EARLY = (1 << 4), + MSI_FLAG_ACTIVATE_EARLY = (1 << 2), /* * Must reactivate when irq is started even when * MSI_FLAG_ACTIVATE_EARLY has been set. */ - MSI_FLAG_MUST_REACTIVATE = (1 << 5), - /* Is level-triggered capable, using two messages */ - MSI_FLAG_LEVEL_CAPABLE = (1 << 6), + MSI_FLAG_MUST_REACTIVATE = (1 << 3), /* Populate sysfs on alloc() and destroy it on free() */ - MSI_FLAG_DEV_SYSFS = (1 << 7), - /* MSI-X entries must be contiguous */ - MSI_FLAG_MSIX_CONTIGUOUS = (1 << 8), + MSI_FLAG_DEV_SYSFS = (1 << 4), /* Allocate simple MSI descriptors */ - MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS = (1 << 9), + MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS = (1 << 5), /* Free MSI descriptors */ - MSI_FLAG_FREE_MSI_DESCS = (1 << 10), + MSI_FLAG_FREE_MSI_DESCS = (1 << 6), /* * Quirk to handle MSI implementations which do not provide * masking. Currently known to affect x86, but has to be partially * handled in the core MSI code. */ - MSI_FLAG_NOMASK_QUIRK = (1 << 11), + MSI_FLAG_NOMASK_QUIRK = (1 << 7), + + /* Mask for the generic functionality */ + MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), + + /* Mask for the domain specific functionality */ + MSI_DOMAIN_FLAGS_MASK = GENMASK(31, 16), + + /* Support multiple PCI MSI interrupts */ + MSI_FLAG_MULTI_PCI_MSI = (1 << 16), + /* Support PCI MSIX interrupts */ + MSI_FLAG_PCI_MSIX = (1 << 17), + /* Is level-triggered capable, using two messages */ + MSI_FLAG_LEVEL_CAPABLE = (1 << 18), + /* MSI-X entries must be contiguous */ + MSI_FLAG_MSIX_CONTIGUOUS = (1 << 19), + }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, -- cgit From b78780d93b068706d04f8f2f02bd08db5da01479 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:25:48 +0100 Subject: genirq/msi: Provide struct msi_parent_ops MSI parent domains must have some control over the MSI domains which are built on top. On domain creation they need to fill in e.g. architecture specific chip callbacks or msi domain ops to make the outermost domain parent agnostic which is obviously required for architecture independence etc. The structure contains: 1) A bitfield which exposes the supported functional features. This allows to check for features and is also used in the initialization callback to mask out unsupported features when the actual domain implementation requests a broader range, e.g. on x86 PCI multi-MSI is only supported by remapping domains but not by the underlying vector domain. The PCI/MSI code can then always request multi-MSI support, but the resulting feature set after creation might not have it set. 2) An optional string prefix which is put in front of domain and chip names during creation of the MSI domain. That allows to keep the naming schemes e.g. on x86 where PCI-MSI domains have a IR- prefix when interrupt remapping is enabled. 3) An initialization callback to sanity check the domain info of the to be created MSI domain, to restrict features and to apply changes in MSI ops and interrupt chip callbacks to accomodate to the particular MSI parent implementation and/or the underlying hierarchy. Add a conveniance function to delegate the initialization from the MSI parent domain to an underlying domain in the hierarchy. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232325.382485843@linutronix.de --- include/linux/irqdomain.h | 5 +++++ include/linux/msi.h | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 24b76688c2fb..a668cc015874 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -46,6 +46,7 @@ struct irq_desc; struct cpumask; struct seq_file; struct irq_affinity_desc; +struct msi_parent_ops; #define IRQ_DOMAIN_IRQ_SPEC_PARAMS 16 @@ -134,6 +135,7 @@ struct irq_domain_chip_generic; * @pm_dev: Pointer to a device that can be utilized for power management * purposes related to the irq domain. * @parent: Pointer to parent irq_domain to support hierarchy irq_domains + * @msi_parent_ops: Pointer to MSI parent domain methods for per device domain init * * Revmap data, used internally by the irq domain code: * @revmap_size: Size of the linear map table @revmap[] @@ -157,6 +159,9 @@ struct irq_domain { #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_domain *parent; #endif +#ifdef CONFIG_GENERIC_MSI_IRQ + const struct msi_parent_ops *msi_parent_ops; +#endif /* reverse map data. The linear map gets appended to the irq_domain */ irq_hw_number_t hwirq_max; diff --git a/include/linux/msi.h b/include/linux/msi.h index a4339ebcc035..9bf3cbad1114 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -500,6 +500,27 @@ enum { }; +/** + * struct msi_parent_ops - MSI parent domain callbacks and configuration info + * + * @supported_flags: Required: The supported MSI flags of the parent domain + * @prefix: Optional: Prefix for the domain and chip name + * @init_dev_msi_info: Required: Callback for MSI parent domains to setup parent + * domain specific domain flags, domain ops and interrupt chip + * callbacks when a per device domain is created. + */ +struct msi_parent_ops { + u32 supported_flags; + const char *prefix; + bool (*init_dev_msi_info)(struct device *dev, struct irq_domain *domain, + struct irq_domain *msi_parent_domain, + struct msi_domain_info *msi_child_info); +}; + +bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *msi_parent_domain, + struct msi_domain_info *msi_child_info); + int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force); -- cgit From ebca4396ee18521e9e5d435a15e5d0ab2eb6b009 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:25:49 +0100 Subject: genirq/msi: Provide data structs for per device domains Provide struct msi_domain_template which contains a bundle of struct irq_chip, struct msi_domain_ops and struct msi_domain_info and a name field. This template is used by MSI device domain implementations to provide the domain specific functionality, feature bits etc. When a MSI domain is created the template is duplicated in the core code so that it can be modified per instance. That means templates can be marked const at the MSI device domain code. The template is a bundle to avoid several allocations and duplications of the involved structures. The name field is used to construct the final domain and chip name via: $PREFIX$NAME-$DEVNAME where prefix is the optional prefix of the MSI parent domain, $NAME is the provided name in template::chip and the device name so that the domain is properly identified. On x86 this results for PCI/MSI in: PCI-MSI-0000:3d:00.1 or IR-PCI-MSIX-0000:3d:00.1 depending on the domain type and the availability of remapping. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232325.442499757@linutronix.de --- include/linux/msi.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 9bf3cbad1114..7fb87371541a 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -74,7 +75,6 @@ struct msi_msg { extern int pci_msi_ignore_mask; /* Helper functions */ -struct irq_data; struct msi_desc; struct pci_dev; struct platform_msi_priv_data; @@ -442,6 +442,20 @@ struct msi_domain_info { void *data; }; +/** + * struct msi_domain_template - Template for MSI device domains + * @name: Storage for the resulting name. Filled in by the core. + * @chip: Interrupt chip for this domain + * @ops: MSI domain ops + * @info: MSI domain info data + */ +struct msi_domain_template { + char name[48]; + struct irq_chip chip; + struct msi_domain_ops ops; + struct msi_domain_info info; +}; + /* * Flags for msi_domain_info * -- cgit From 61bf992fc618503c910416f28afa0b015838b72b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:25:51 +0100 Subject: genirq/msi: Add size info to struct msi_domain_info To allow proper range checking especially for dynamic allocations add a size field to struct msi_domain_info. If the field is 0 then the size is unknown or unlimited (up to MSI_MAX_INDEX) to provide backwards compability. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232325.501144862@linutronix.de --- include/linux/msi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 7fb87371541a..08a0e2abf048 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -422,6 +422,10 @@ struct msi_domain_ops { * struct msi_domain_info - MSI interrupt domain data * @flags: Flags to decribe features and capabilities * @bus_token: The domain bus token + * @hwsize: The hardware table size or the software index limit. + * If 0 then the size is considered unlimited and + * gets initialized to the maximum software index limit + * by the domain creation code. * @ops: The callback data structure * @chip: Optional: associated interrupt chip * @chip_data: Optional: associated interrupt chip data @@ -433,6 +437,7 @@ struct msi_domain_ops { struct msi_domain_info { u32 flags; enum irq_domain_bus_token bus_token; + unsigned int hwsize; struct msi_domain_ops *ops; struct irq_chip *chip; void *chip_data; -- cgit From 4443664f298d1a2cba25a2e48d53b78f4138209b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:25:54 +0100 Subject: genirq/irqdomain: Add irq_domain:: Dev for per device MSI domains Per device domains require the device pointer of the device which instantiated the domain for some purposes. Add the pointer to struct irq_domain. It will be used in the next step which provides the infrastructure to create per device MSI domains. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232325.618807601@linutronix.de --- include/linux/irqdomain.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index a668cc015874..a372086750ca 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -132,6 +132,9 @@ struct irq_domain_chip_generic; * @gc: Pointer to a list of generic chips. There is a helper function for * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. + * @dev: Pointer to the device which instantiated the irqdomain + * With per device irq domains this is not necessarily the same + * as @pm_dev. * @pm_dev: Pointer to a device that can be utilized for power management * purposes related to the irq domain. * @parent: Pointer to parent irq_domain to support hierarchy irq_domains @@ -155,6 +158,7 @@ struct irq_domain { struct fwnode_handle *fwnode; enum irq_domain_bus_token bus_token; struct irq_domain_chip_generic *gc; + struct device *dev; struct device *pm_dev; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_domain *parent; -- cgit From 27a6dea3ebaab3d6f8ded969ec3af710bcbe0c02 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:25:56 +0100 Subject: genirq/msi: Provide msi_create/free_device_irq_domain() Now that all prerequsites are in place, provide the actual interfaces for creating and removing per device interrupt domains. MSI device interrupt domains are created from the provided msi_domain_template which is duplicated so that it can be modified for the particular device. The name of the domain and the name of the interrupt chip are composed by "$(PREFIX)$(CHIPNAME)-$(DEVNAME)" $PREFIX: The optional prefix provided by the underlying MSI parent domain via msi_parent_ops::prefix. $CHIPNAME: The name of the irq_chip in the template $DEVNAME: The name of the device The domain is further initialized through a MSI parent domain callback which fills in the required functionality for the parent domain or domains further down the hierarchy. This initialization can fail, e.g. when the requested feature or MSI domain type cannot be supported. The domain pointer is stored in the pointer array inside of msi_device_data which is attached to the domain. The domain can be removed via the API or left for disposal via devres when the device is torn down. The API removal is useful e.g. for PCI to have seperate domains for MSI and MSI-X, which are mutually exclusive and always occupy the default domain id slot. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232325.678838546@linutronix.de --- include/linux/msi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 08a0e2abf048..ef46a3ee18e4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -547,6 +547,12 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); +bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, + const struct msi_domain_template *template, + unsigned int hwsize, void *domain_data, + void *chip_data); +void msi_remove_device_irq_domain(struct device *dev, unsigned int domid); + int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, -- cgit From 26e91b75bf6108550035355c835bf0c93c885b61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:25:57 +0100 Subject: genirq/msi: Provide msi_match_device_domain() Provide an interface to match a per device domain bus token. This allows to query which type of domain is installed for a particular domain id. Will be used for PCI to avoid frequent create/remove cycles for the MSI resp. MSI-X domains. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232325.738047902@linutronix.de --- include/linux/msi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index ef46a3ee18e4..b4ab00562a29 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -553,6 +553,9 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, void *chip_data); void msi_remove_device_irq_domain(struct device *dev, unsigned int domid); +bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, + enum irq_domain_bus_token bus_token); + int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, -- cgit From 8f8bcc8c720c360885639de66fe69756febed824 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:16 -0400 Subject: vfio/pci: Move all the SPAPR PCI specific logic to vfio_pci_core.ko The vfio_spapr_pci_eeh_open/release() functions are one line wrappers around an arch function. Just call them directly. This eliminates some weird exported symbols that don't need to exist. Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/1-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 43b67e46a2cb..9378ca79d548 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -233,21 +233,10 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, int max_irq_type, size_t *data_size); -struct pci_dev; #if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) -void vfio_spapr_pci_eeh_open(struct pci_dev *pdev); -void vfio_spapr_pci_eeh_release(struct pci_dev *pdev); long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg); #else -static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) -{ -} - -static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) -{ -} - static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg) -- cgit From e276e25819b8a173a21947720bb0a548c0b724b7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:18 -0400 Subject: vfio: Move vfio_spapr_iommu_eeh_ioctl into vfio_iommu_spapr_tce.c As with the previous patch EEH is always enabled if SPAPR_TCE_IOMMU, so move this last bit of code into the main module. Now that this function only processes VFIO_EEH_PE_OP remove a level of indenting as well, it is only called by a case statement that already checked VFIO_EEH_PE_OP. This eliminates an unnecessary module and SPAPR code in a global header. Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 9378ca79d548..b4d5d4ca3d7d 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -233,18 +233,6 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, int max_irq_type, size_t *data_size); -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) -long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, - unsigned long arg); -#else -static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, - unsigned int cmd, - unsigned long arg) -{ - return -ENOTTY; -} -#endif /* CONFIG_VFIO_SPAPR_EEH */ - /* * IRQfd - generic */ -- cgit From 8c045cd21644a0acc815d3108018a8b6fd474804 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Wed, 16 Nov 2022 13:30:05 +0200 Subject: soc: qcom: llcc: Add configuration data for SM8550 Add LLCC configuration data for SM8550 SoC. Signed-off-by: Abel Vesa Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20221116113005.2653284-4-abel.vesa@linaro.org --- include/linux/soc/qcom/llcc-qcom.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index bc2fb8343a94..ad1fd718169d 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -42,7 +42,19 @@ #define LLCC_CPUHWT 36 #define LLCC_MDMCLAD2 37 #define LLCC_CAMEXP1 38 +#define LLCC_CMPTHCP 39 +#define LLCC_LCPDARE 40 #define LLCC_AENPU 45 +#define LLCC_ISLAND1 46 +#define LLCC_ISLAND2 47 +#define LLCC_ISLAND3 48 +#define LLCC_ISLAND4 49 +#define LLCC_CAMEXP2 50 +#define LLCC_CAMEXP3 51 +#define LLCC_CAMEXP4 52 +#define LLCC_DISP_WB 53 +#define LLCC_DISP_1 54 +#define LLCC_VIDVSP 64 /** * struct llcc_slice_desc - Cache slice descriptor -- cgit From bd141a3db40c877e01de8e981edb57c03199d876 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:02 +0100 Subject: genirq/msi: Provide BUS_DEVICE_PCI_MSI[X] Provide new bus tokens for the upcoming per device PCI/MSI and PCI/MSIX interrupt domains. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232325.917219885@linutronix.de --- include/linux/irqdomain_defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain_defs.h b/include/linux/irqdomain_defs.h index 69035b4f740a..b3f4b7ef31f1 100644 --- a/include/linux/irqdomain_defs.h +++ b/include/linux/irqdomain_defs.h @@ -21,6 +21,8 @@ enum irq_domain_bus_token { DOMAIN_BUS_TI_SCI_INTA_MSI, DOMAIN_BUS_WAKEUP, DOMAIN_BUS_VMD_MSI, + DOMAIN_BUS_PCI_DEVICE_MSI, + DOMAIN_BUS_PCI_DEVICE_MSIX, }; #endif /* _LINUX_IRQDOMAIN_DEFS_H */ -- cgit From 45c0402457c1ed2f07ee32dc129ae710e0dc288c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:07 +0100 Subject: PCI/MSI: Remove unused pci_dev_has_special_msi_domain() The check for special MSI domains like VMD which prevents the interrupt remapping code to overwrite device::msi::domain is not longer required and has been replaced by an x86 specific version which is aware of MSI parent domains. Remove it. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Bjorn Helgaas Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.093093200@linutronix.de --- include/linux/msi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index b4ab00562a29..b5dda4bf4156 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -617,7 +617,6 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct irq_domain *parent); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); -bool pci_dev_has_special_msi_domain(struct pci_dev *pdev); #else /* CONFIG_PCI_MSI */ static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) { -- cgit From 9a945234abea27d45f8d89e1a1b35ab5bf41dd01 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:08 +0100 Subject: iommu/vt-d: Switch to MSI parent domains Remove the global PCI/MSI irqdomain implementation and provide the required MSI parent ops so the PCI/MSI code can detect the new parent and setup per device domains. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.151226317@linutronix.de --- include/linux/irqdomain_defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain_defs.h b/include/linux/irqdomain_defs.h index b3f4b7ef31f1..3a09396ac88d 100644 --- a/include/linux/irqdomain_defs.h +++ b/include/linux/irqdomain_defs.h @@ -23,6 +23,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_VMD_MSI, DOMAIN_BUS_PCI_DEVICE_MSI, DOMAIN_BUS_PCI_DEVICE_MSIX, + DOMAIN_BUS_DMAR, }; #endif /* _LINUX_IRQDOMAIN_DEFS_H */ -- cgit From cc7594ffadde77e2825faf1c576230530c829bc3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:10 +0100 Subject: iommu/amd: Switch to MSI base domains Remove the global PCI/MSI irqdomain implementation and provide the required MSI parent ops so the PCI/MSI code can detect the new parent and setup per device domains. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.209212272@linutronix.de --- include/linux/irqdomain_defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain_defs.h b/include/linux/irqdomain_defs.h index 3a09396ac88d..0b2d8a83e0d8 100644 --- a/include/linux/irqdomain_defs.h +++ b/include/linux/irqdomain_defs.h @@ -24,6 +24,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_PCI_DEVICE_MSI, DOMAIN_BUS_PCI_DEVICE_MSIX, DOMAIN_BUS_DMAR, + DOMAIN_BUS_AMDVI, }; #endif /* _LINUX_IRQDOMAIN_DEFS_H */ -- cgit From 06bff9e347271566e8dd79e7c3eb971660209a00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:13 +0100 Subject: genirq/msi: Provide struct msi_map A simple struct to hold a MSI index / Linux interrupt number pair. It will be returned from the dynamic vector allocation function and handed back to the corresponding free() function. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.326410494@linutronix.de --- include/linux/msi_api.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi_api.h b/include/linux/msi_api.h index 8640171288a8..4cb7f4c9a3ae 100644 --- a/include/linux/msi_api.h +++ b/include/linux/msi_api.h @@ -18,6 +18,19 @@ enum msi_domain_ids { MSI_MAX_DEVICE_IRQDOMAINS, }; +/** + * msi_map - Mapping between MSI index and Linux interrupt number + * @index: The MSI index, e.g. slot in the MSI-X table or + * a software managed index if >= 0. If negative + * the allocation function failed and it contains + * the error code. + * @virq: The associated Linux interrupt number + */ +struct msi_map { + int index; + int virq; +}; + unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index); /** -- cgit From efd42049657e958797a483f793e4064042faa49c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:15 +0100 Subject: genirq/msi: Provide msi_desc:: Msi_data The upcoming support for PCI/IMS requires to store some information related to the message handling in the MSI descriptor, e.g. PASID or a pointer to a queue. Provide a generic storage struct which maps over the existing PCI specific storage which means the size of struct msi_desc is not getting bigger. This storage struct has two elements: 1) msi_domain_cookie 2) msi_instance_cookie The domain cookie is going to be used to store domain specific information, e.g. iobase pointer, data pointer. The instance cookie is going to be handed in when allocating an interrupt on an IMS domain so the irq chip callbacks of the IMS domain have the necessary per vector information available. It also comes in handy when cleaning up the platform MSI code for wire to MSI bridges which need to hand down the type information to the underlying interrupt domain. For the core code the cookies are opaque and meaningless. It just stores the instance cookie during an allocation through the upcoming interfaces for IMS and wire to MSI brigdes. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.385036043@linutronix.de --- include/linux/msi.h | 38 +++++++++++++++++++++++++++++++++++++- include/linux/msi_api.h | 17 +++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index b5dda4bf4156..dca3b801428f 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -125,6 +125,38 @@ struct pci_msi_desc { }; }; +/** + * union msi_domain_cookie - Opaque MSI domain specific data + * @value: u64 value store + * @ptr: Pointer to domain specific data + * @iobase: Domain specific IOmem pointer + * + * The content of this data is implementation defined and used by the MSI + * domain to store domain specific information which is requried for + * interrupt chip callbacks. + */ +union msi_domain_cookie { + u64 value; + void *ptr; + void __iomem *iobase; +}; + +/** + * struct msi_desc_data - Generic MSI descriptor data + * @dcookie: Cookie for MSI domain specific data which is required + * for irq_chip callbacks + * @icookie: Cookie for the MSI interrupt instance provided by + * the usage site to the allocation function + * + * The content of this data is implementation defined, e.g. PCI/IMS + * implementations define the meaning of the data. The MSI core ignores + * this data completely. + */ +struct msi_desc_data { + union msi_domain_cookie dcookie; + union msi_instance_cookie icookie; +}; + #define MSI_MAX_INDEX ((unsigned int)USHRT_MAX) /** @@ -142,6 +174,7 @@ struct pci_msi_desc { * * @msi_index: Index of the msi descriptor * @pci: PCI specific msi descriptor data + * @data: Generic MSI descriptor data */ struct msi_desc { /* Shared device/bus type independent data */ @@ -161,7 +194,10 @@ struct msi_desc { void *write_msi_msg_data; u16 msi_index; - struct pci_msi_desc pci; + union { + struct pci_msi_desc pci; + struct msi_desc_data data; + }; }; /* diff --git a/include/linux/msi_api.h b/include/linux/msi_api.h index 4cb7f4c9a3ae..2e4456ec9f6b 100644 --- a/include/linux/msi_api.h +++ b/include/linux/msi_api.h @@ -18,6 +18,23 @@ enum msi_domain_ids { MSI_MAX_DEVICE_IRQDOMAINS, }; +/** + * union msi_instance_cookie - MSI instance cookie + * @value: u64 value store + * @ptr: Pointer to usage site specific data + * + * This cookie is handed to the IMS allocation function and stored in the + * MSI descriptor for the interrupt chip callbacks. + * + * The content of this cookie is MSI domain implementation defined. For + * PCI/IMS implementations this could be a PASID or a pointer to queue + * memory. + */ +union msi_instance_cookie { + u64 value; + void *ptr; +}; + /** * msi_map - Mapping between MSI index and Linux interrupt number * @index: The MSI index, e.g. slot in the MSI-X table or -- cgit From 8f986fd7755bec8b8c5776824afa1bd1151986d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:16 +0100 Subject: genirq/msi: Provide msi_domain_ops:: Prepare_desc() The existing MSI domain ops msi_prepare() and set_desc() turned out to be unsuitable for implementing IMS support. msi_prepare() does not operate on the MSI descriptors. set_desc() lacks an irq_domain pointer and has a completely different purpose. Introduce a prepare_desc() op which allows IMS implementations to amend an MSI descriptor which was allocated by the core code, e.g. by adjusting the iomem base or adding some data based on the allocated index. This is way better than requiring that all IMS domain implementations preallocate the MSI descriptor and then allocate the interrupt. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.444560717@linutronix.de --- include/linux/msi.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index dca3b801428f..cb0bee3f43a7 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -410,6 +410,8 @@ struct msi_domain_info; * @msi_init: Domain specific init function for MSI interrupts * @msi_free: Domain specific function to free a MSI interrupts * @msi_prepare: Prepare the allocation of the interrupts in the domain + * @prepare_desc: Optional function to prepare the allocated MSI descriptor + * in the domain * @set_desc: Set the msi descriptor for an interrupt * @domain_alloc_irqs: Optional function to override the default allocation * function. @@ -421,7 +423,7 @@ struct msi_domain_info; * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying * irqdomain. * - * @msi_check, @msi_prepare and @set_desc are callbacks used by the + * @msi_check, @msi_prepare, @prepare_desc and @set_desc are callbacks used by the * msi_domain_alloc/free_irqs*() variants. * * @domain_alloc_irqs, @domain_free_irqs can be used to override the @@ -444,6 +446,8 @@ struct msi_domain_ops { int (*msi_prepare)(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); + void (*prepare_desc)(struct irq_domain *domain, msi_alloc_info_t *arg, + struct msi_desc *desc); void (*set_desc)(msi_alloc_info_t *arg, struct msi_desc *desc); int (*domain_alloc_irqs)(struct irq_domain *domain, -- cgit From 3d393b21740bffbeeae7d4fa534a6b16c3e3e832 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:18 +0100 Subject: genirq/msi: Provide msi_domain_alloc_irq_at() For supporting post MSI-X enable allocations and for the upcoming PCI/IMS support a separate interface is required which allows not only the allocation of a specific index, but also the allocation of any, i.e. the next free index. The latter is especially required for IMS because IMS completely does away with index to functionality mappings which are often found in MSI/MSI-X implementation. But even with MSI-X there are devices where only the first few indices have a fixed functionality and the rest is freely assignable by software, e.g. to queues. msi_domain_alloc_irq_at() is also different from the range based interfaces as it always enforces that the MSI descriptor is allocated by the core code and not preallocated by the caller like the PCI/MSI[-X] enable code path does. msi_domain_alloc_irq_at() can be invoked with the index argument set to MSI_ANY_INDEX which makes the core code pick the next free index. The irq domain can provide a prepare_desc() operation callback in it's msi_domain_ops to do domain specific post allocation initialization before the actual Linux interrupt and the associated interrupt descriptor and hierarchy alloccations are conducted. The function also takes an optional @icookie argument which is of type union msi_instance_cookie. This cookie is not used by the core code and is stored in the allocated msi_desc::data::icookie. The meaning of the cookie is completely implementation defined. In case of IMS this might be a PASID or a pointer to a device queue, but for the MSI core it's opaque and not used in any way. The function returns a struct msi_map which on success contains the allocated index number and the Linux interrupt number so the caller can spare the index to Linux interrupt number lookup. On failure map::index contains the error code and map::virq is 0. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.501359457@linutronix.de --- include/linux/msi.h | 4 ++++ include/linux/msi_api.h | 7 +++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index cb0bee3f43a7..00c501979ff1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -80,6 +80,7 @@ struct pci_dev; struct platform_msi_priv_data; struct device_attribute; struct irq_domain; +struct irq_affinity_desc; void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ @@ -602,6 +603,9 @@ int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs); +struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index, + const struct irq_affinity_desc *affdesc, + union msi_instance_cookie *cookie); void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); diff --git a/include/linux/msi_api.h b/include/linux/msi_api.h index 2e4456ec9f6b..5ae72d1912c4 100644 --- a/include/linux/msi_api.h +++ b/include/linux/msi_api.h @@ -48,6 +48,13 @@ struct msi_map { int virq; }; +/* + * Constant to be used for dynamic allocations when the allocation is any + * free MSI index, which is either an entry in a hardware table or a + * software managed index. + */ +#define MSI_ANY_INDEX UINT_MAX + unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index); /** -- cgit From b834e3c08fc6c4460c2bce6575cba4705f6301e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:20 +0100 Subject: genirq/msi: Provide MSI_FLAG_MSIX_ALLOC_DYN Provide a new MSI feature flag in preparation for dynamic MSIX allocation after the initial MSI-X enable has been done. This needs to be an explicit MSI interrupt domain feature because quite some implementations (both interrupt domains and legacy allocation mode) have clear expectations that the allocation code is only invoked when MSI-X is about to be enabled. They either talk to hypervisors or do some other work and are not prepared to be invoked on an already MSI-X enabled device. This is also explicit MSI-X only because rewriting the size of the MSI entries is only possible when disabling MSI which in turn might cause lost interrupts on the device. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.558843119@linutronix.de --- include/linux/msi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 00c501979ff1..3cb15866ffbf 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -557,7 +557,8 @@ enum { MSI_FLAG_LEVEL_CAPABLE = (1 << 18), /* MSI-X entries must be contiguous */ MSI_FLAG_MSIX_CONTIGUOUS = (1 << 19), - + /* PCI/MSI-X vectors can be dynamically allocated/freed post MSI-X enable */ + MSI_FLAG_PCI_MSIX_ALLOC_DYN = (1 << 20), }; /** -- cgit From 34026364df8eca05ee32e706a2c014511a19af02 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:24 +0100 Subject: PCI/MSI: Provide post-enable dynamic allocation interfaces for MSI-X MSI-X vectors can be allocated after the initial MSI-X enablement, but this needs explicit support of the underlying interrupt domains. Provide a function to query the ability and functions to allocate/free individual vectors post-enable. The allocation can either request a specific index in the MSI-X table or with the index argument MSI_ANY_INDEX it allocates the next free vector. The return value is a struct msi_map which on success contains both index and the Linux interrupt number. In case of failure index is negative and the Linux interrupt number is 0. The allocation function is for a single MSI-X index at a time as that's sufficient for the most urgent use case VFIO to get rid of the 'disable MSI-X, reallocate, enable-MSI-X' cycle which is prone to lost interrupts and redirections to the legacy and obviously unhandled INTx. As single index allocation is also sufficient for the use cases Jason Gunthorpe pointed out: Allocation of a MSI-X or IMS vector for a network queue. See Link below. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Bjorn Helgaas Acked-by: Marc Zyngier Link: https://lore.kernel.org/all/20211126232735.547996838@linutronix.de Link: https://lore.kernel.org/r/20221124232326.731233614@linutronix.de --- include/linux/pci.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 243e48f79e82..68b14ba93df2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -1559,6 +1560,11 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags, struct irq_affinity *affd); +bool pci_msix_can_alloc_dyn(struct pci_dev *dev); +struct msi_map pci_msix_alloc_irq_at(struct pci_dev *dev, unsigned int index, + const struct irq_affinity_desc *affdesc); +void pci_msix_free_irq(struct pci_dev *pdev, struct msi_map map); + void pci_free_irq_vectors(struct pci_dev *dev); int pci_irq_vector(struct pci_dev *dev, unsigned int nr); const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec); -- cgit From e23d4192bf9b612bce5b24f22719fd3cc6edaa69 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:28 +0100 Subject: genirq/msi: Provide constants for PCI/IMS support Provide the necessary constants for PCI/IMS support: - A new bus token for MSI irqdomain identification - A MSI feature flag for the MSI irqdomains to signal support - A secondary domain id The latter expands the device internal domain pointer storage array from 1 to 2 entries. That extra pointer is mostly unused today, but the alternative solutions would not be free either and would introduce more complexity all over the place. Trade the 8bytes for simplicity. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.846169830@linutronix.de --- include/linux/irqdomain_defs.h | 1 + include/linux/msi.h | 2 ++ include/linux/msi_api.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain_defs.h b/include/linux/irqdomain_defs.h index 0b2d8a83e0d8..c29921fd8cd1 100644 --- a/include/linux/irqdomain_defs.h +++ b/include/linux/irqdomain_defs.h @@ -25,6 +25,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_PCI_DEVICE_MSIX, DOMAIN_BUS_DMAR, DOMAIN_BUS_AMDVI, + DOMAIN_BUS_PCI_DEVICE_IMS, }; #endif /* _LINUX_IRQDOMAIN_DEFS_H */ diff --git a/include/linux/msi.h b/include/linux/msi.h index 3cb15866ffbf..a112b913fff9 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -559,6 +559,8 @@ enum { MSI_FLAG_MSIX_CONTIGUOUS = (1 << 19), /* PCI/MSI-X vectors can be dynamically allocated/freed post MSI-X enable */ MSI_FLAG_PCI_MSIX_ALLOC_DYN = (1 << 20), + /* Support for PCI/IMS */ + MSI_FLAG_PCI_IMS = (1 << 21), }; /** diff --git a/include/linux/msi_api.h b/include/linux/msi_api.h index 5ae72d1912c4..391087ad99b1 100644 --- a/include/linux/msi_api.h +++ b/include/linux/msi_api.h @@ -15,6 +15,7 @@ struct device; */ enum msi_domain_ids { MSI_DEFAULT_DOMAIN, + MSI_SECONDARY_DOMAIN, MSI_MAX_DEVICE_IRQDOMAINS, }; -- cgit From 0194425af0c87acaad457989a2c6d90dba58e776 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:29 +0100 Subject: PCI/MSI: Provide IMS (Interrupt Message Store) support IMS (Interrupt Message Store) is a new specification which allows implementation specific storage of MSI messages contrary to the strict standard specified MSI and MSI-X message stores. This requires new device specific interrupt domains to handle the implementation defined storage which can be an array in device memory or host/guest memory which is shared with hardware queues. Add a function to create IMS domains for PCI devices. IMS domains are using the new per device domain mechanism and are configured by the device driver via a template. IMS domains are created as secondary device domains so they work side on side with MSI[-X] on the same device. The IMS domains have a few constraints: - The index space is managed by the core code. Device memory based IMS provides a storage array with a fixed size which obviously requires an index. But there is no association between index and functionality so the core can randomly allocate an index in the array. System memory based IMS does not have the concept of an index as the storage is somewhere in memory. In that case the index is purely software based to keep track of the allocations. - There is no requirement for consecutive index ranges This is currently a limitation of the MSI core and can be implemented if there is a justified use case by changing the internal storage from xarray to maple_tree. For now it's single vector allocation. - The interrupt chip must provide the following callbacks: - irq_mask() - irq_unmask() - irq_write_msi_msg() - The interrupt chip must provide the following optional callbacks when the irq_mask(), irq_unmask() and irq_write_msi_msg() callbacks cannot operate directly on hardware, e.g. in the case that the interrupt message store is in queue memory: - irq_bus_lock() - irq_bus_unlock() These callbacks are invoked from preemptible task context and are allowed to sleep. In this case the mandatory callbacks above just store the information. The irq_bus_unlock() callback is supposed to make the change effective before returning. - Interrupt affinity setting is handled by the underlying parent interrupt domain and communicated to the IMS domain via irq_write_msi_msg(). IMS domains cannot have a irq_set_affinity() callback. That's a reasonable restriction similar to the PCI/MSI device domain implementations. The domain is automatically destroyed when the PCI device is removed. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.904316841@linutronix.de --- include/linux/pci.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 68b14ba93df2..1592b630d919 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2487,6 +2487,11 @@ static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev) void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type); #endif +struct msi_domain_template; + +bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template, + unsigned int hwsize, void *data); + #include #define pci_printk(level, pdev, fmt, arg...) \ -- cgit From c9e5bea273834a63b5e9ba90ad94b305ba50704e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Nov 2022 00:26:31 +0100 Subject: PCI/MSI: Provide pci_ims_alloc/free_irq() Single vector allocation which allocates the next free index in the IMS space. The free function releases. All allocated vectors are released also via pci_free_vectors() which is also releasing MSI/MSI-X vectors. Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Tian Acked-by: Bjorn Helgaas Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221124232326.961711347@linutronix.de --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1592b630d919..aa514b54c681 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2491,6 +2491,9 @@ struct msi_domain_template; bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template, unsigned int hwsize, void *data); +struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie, + const struct irq_affinity_desc *affdesc); +void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map); #include -- cgit From 6c452cff79f8bf1c0146fda598d32061cfd25443 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 2 Dec 2022 19:35:26 +0100 Subject: pwm: Make .get_state() callback return an error code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .get_state() might fail in some cases. To make it possible that a driver signals such a failure change the prototype of .get_state() to return an error code. This patch was created using coccinelle and the following semantic patch: @p1@ identifier getstatefunc; identifier driver; @@ struct pwm_ops driver = { ..., .get_state = getstatefunc ,... }; @p2@ identifier p1.getstatefunc; identifier chip, pwm, state; @@ -void +int getstatefunc(struct pwm_chip *chip, struct pwm_device *pwm, struct pwm_state *state) { ... - return; + return 0; ... } plus the actual change of the prototype in include/linux/pwm.h (plus some manual fixing of indentions and empty lines). So for now all drivers return success unconditionally. They are adapted in the following patches to make the changes easier reviewable. Reviewed-by: Heiko Stuebner Reviewed-by: Baolin Wang Reviewed-by: Tzung-Bi Shih Reviewed-by: Neil Armstrong Reviewed-by: Nobuhiro Iwamatsu Reviewed-by: Andre Przywara Reviewed-by: Dave Stevenson Acked-by: Douglas Anderson Acked-by: Jernej Skrabec Acked-by: Pavel Machek Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20221130152148.2769768-2-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index d70c6e5a839d..4de09163c968 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -276,8 +276,8 @@ struct pwm_ops { struct pwm_capture *result, unsigned long timeout); int (*apply)(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_state *state); - void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state); + int (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state); struct module *owner; }; -- cgit From 3afee4ed336ed7a6cff78d23339879ffb654e02e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 22:10:23 +0200 Subject: net/mlx5: Add HW definitions for IPsec packet offload Add all needed bits to support IPsec packet offload mode. Reviewed-by: Raed Salem Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/mlx5/mlx5_ifc.h | 53 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6f..300b56ea5ff4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -445,7 +445,10 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 max_modify_header_actions[0x8]; u8 max_ft_level[0x8]; - u8 reserved_at_40[0x6]; + u8 reformat_add_esp_trasport[0x1]; + u8 reserved_at_41[0x2]; + u8 reformat_del_esp_trasport[0x1]; + u8 reserved_at_44[0x2]; u8 execute_aso[0x1]; u8 reserved_at_47[0x19]; @@ -638,8 +641,10 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 reserved_at_1a0[0x8]; u8 macsec_syndrome[0x8]; + u8 ipsec_syndrome[0x8]; + u8 reserved_at_1b8[0x8]; - u8 reserved_at_1b0[0x50]; + u8 reserved_at_1c0[0x40]; }; struct mlx5_ifc_fte_match_set_misc3_bits { @@ -6384,6 +6389,9 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x2, MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3, MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4 = 0x5, + MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT = 0x8, + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, @@ -11563,6 +11571,41 @@ enum { MLX5_IPSEC_OBJECT_ICV_LEN_16B, }; +enum { + MLX5_IPSEC_ASO_REG_C_0_1 = 0x0, + MLX5_IPSEC_ASO_REG_C_2_3 = 0x1, + MLX5_IPSEC_ASO_REG_C_4_5 = 0x2, + MLX5_IPSEC_ASO_REG_C_6_7 = 0x3, +}; + +enum { + MLX5_IPSEC_ASO_MODE = 0x0, + MLX5_IPSEC_ASO_REPLAY_PROTECTION = 0x1, + MLX5_IPSEC_ASO_INC_SN = 0x2, +}; + +struct mlx5_ifc_ipsec_aso_bits { + u8 valid[0x1]; + u8 reserved_at_201[0x1]; + u8 mode[0x2]; + u8 window_sz[0x2]; + u8 soft_lft_arm[0x1]; + u8 hard_lft_arm[0x1]; + u8 remove_flow_enable[0x1]; + u8 esn_event_arm[0x1]; + u8 reserved_at_20a[0x16]; + + u8 remove_flow_pkt_cnt[0x20]; + + u8 remove_flow_soft_lft[0x20]; + + u8 reserved_at_260[0x80]; + + u8 mode_parameter[0x20]; + + u8 replay_protection_window[0x100]; +}; + struct mlx5_ifc_ipsec_obj_bits { u8 modify_field_select[0x40]; u8 full_offload[0x1]; @@ -11584,7 +11627,11 @@ struct mlx5_ifc_ipsec_obj_bits { u8 implicit_iv[0x40]; - u8 reserved_at_100[0x700]; + u8 reserved_at_100[0x8]; + u8 ipsec_aso_access_pd[0x18]; + u8 reserved_at_120[0xe0]; + + struct mlx5_ifc_ipsec_aso_bits ipsec_aso; }; struct mlx5_ifc_create_ipsec_obj_in_bits { -- cgit From 64f6a5d1922bf6d2b2d845de20d4563a6f328e2d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 5 Dec 2022 13:12:03 +0100 Subject: container_of: add container_of_const() that preserves const-ness of the pointer container_of does not preserve the const-ness of a pointer that is passed into it, which can cause C code that passes in a const pointer to get a pointer back that is not const and then scribble all over the data in it. To prevent this, container_of_const() will preserve the const status of the pointer passed into it using the newly available _Generic() method. Suggested-by: Jason Gunthorpe Suggested-by: Sakari Ailus Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Jason Gunthorpe Reviewed-by: Andy Shevchenko Reviewed-by: Sakari Ailus Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20221205121206.166576-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/container_of.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/container_of.h b/include/linux/container_of.h index 2008e9f4058c..1d898f9158b4 100644 --- a/include/linux/container_of.h +++ b/include/linux/container_of.h @@ -22,4 +22,17 @@ "pointer type mismatch in container_of()"); \ ((type *)(__mptr - offsetof(type, member))); }) +/** + * container_of_const - cast a member of a structure out to the containing + * structure and preserve the const-ness of the pointer + * @ptr: the pointer to the member + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + */ +#define container_of_const(ptr, type, member) \ + _Generic(ptr, \ + const typeof(*(ptr)) *: ((const type *)container_of(ptr, type, member)),\ + default: ((type *)container_of(ptr, type, member)) \ + ) + #endif /* _LINUX_CONTAINER_OF_H */ -- cgit From 6149f83b3165955e9519de483b30b26d1518ad0f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 5 Dec 2022 13:12:04 +0100 Subject: device.h: move kobj_to_dev() to use container_of_const() Instead of rolling our own const-checking logic, use the newly introduced container_of_const() to handle it all for us automatically. Cc: Thomas Gleixner Reviewed-by: Sakari Ailus Reviewed-by: Jason Gunthorpe Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20221205121206.166576-2-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 84ae52de6746..8d172d06b8c1 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -680,26 +680,7 @@ struct device_link { bool supplier_preactivated; /* Owned by consumer probe. */ }; -static inline struct device *__kobj_to_dev(struct kobject *kobj) -{ - return container_of(kobj, struct device, kobj); -} - -static inline const struct device *__kobj_to_dev_const(const struct kobject *kobj) -{ - return container_of(kobj, const struct device, kobj); -} - -/* - * container_of() will happily take a const * and spit back a non-const * as it - * is just doing pointer math. But we want to be a bit more careful in the - * driver code, so manually force any const * of a kobject to also be a const * - * to a device. - */ -#define kobj_to_dev(kobj) \ - _Generic((kobj), \ - const struct kobject *: __kobj_to_dev_const, \ - struct kobject *: __kobj_to_dev)(kobj) +#define kobj_to_dev(__kobj) container_of_const(__kobj, struct device, kobj) /** * device_iommu_mapped - Returns true when the device DMA is translated -- cgit From c3da679286bee1d897bb24a804cca4ff58781bec Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 5 Dec 2022 13:12:05 +0100 Subject: usb.h: take advantage of container_of_const() Instead of rolling our own const-checking logic in to_usb_interface() and to_usb_device() use the newly added container_of_const() instead, making the logic much simpler overall. Reviewed-by: Sakari Ailus Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221205121206.166576-3-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 42 ++---------------------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 4b463a5e4ba2..010c681b8822 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -259,26 +259,7 @@ struct usb_interface { struct work_struct reset_ws; /* for resets in atomic context */ }; -static inline struct usb_interface *__to_usb_interface(struct device *d) -{ - return container_of(d, struct usb_interface, dev); -} - -static inline const struct usb_interface *__to_usb_interface_const(const struct device *d) -{ - return container_of(d, struct usb_interface, dev); -} - -/* - * container_of() will happily take a const * and spit back a non-const * as it - * is just doing pointer math. But we want to be a bit more careful in the USB - * driver code, so manually force any const * of a device to also be a const * - * to a usb_device. - */ -#define to_usb_interface(dev) \ - _Generic((dev), \ - const struct device *: __to_usb_interface_const, \ - struct device *: __to_usb_interface)(dev) +#define to_usb_interface(__dev) container_of_const(__dev, struct usb_interface, dev) static inline void *usb_get_intfdata(struct usb_interface *intf) { @@ -730,26 +711,7 @@ struct usb_device { unsigned use_generic_driver:1; }; -static inline struct usb_device *__to_usb_device(struct device *d) -{ - return container_of(d, struct usb_device, dev); -} - -static inline const struct usb_device *__to_usb_device_const(const struct device *d) -{ - return container_of(d, struct usb_device, dev); -} - -/* - * container_of() will happily take a const * and spit back a non-const * as it - * is just doing pointer math. But we want to be a bit more careful in the USB - * driver code, so manually force any const * of a device to also be a const * - * to a usb_device. - */ -#define to_usb_device(dev) \ - _Generic((dev), \ - const struct device *: __to_usb_device_const, \ - struct device *: __to_usb_device)(dev) +#define to_usb_device(__dev) container_of_const(__dev, struct usb_device, dev) static inline struct usb_device *__intf_to_usbdev(struct usb_interface *intf) { -- cgit From db1c7d77976775483a8ef240b4c705f113e13ea1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Dec 2022 15:44:07 +0100 Subject: block: bio_copy_data_iter With the pktcdvdv removal, bio_copy_data_iter is unused now. Fold the logic into bio_copy_data and remove the separate lower level function. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221206144407.722049-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 2c5806997bbf..b231a665682a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -475,8 +475,6 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, - struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); -- cgit From c943a9374d12bebca2d0de7e273b1c723b58c122 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 10:34:25 +0200 Subject: net/mlx5: Introduce ifc bits for pre_copy Introduce ifc related stuff to enable PRE_COPY of VF during migration. Signed-off-by: Shay Drory Acked-by: Leon Romanovsky Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6f..230a96626a5f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1882,7 +1882,12 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 max_reformat_remove_size[0x8]; u8 max_reformat_remove_offset[0x8]; - u8 reserved_at_c0[0xe0]; + u8 reserved_at_c0[0x8]; + u8 migration_multi_load[0x1]; + u8 migration_tracking_state[0x1]; + u8 reserved_at_ca[0x16]; + + u8 reserved_at_e0[0xc0]; u8 reserved_at_1a0[0xb]; u8 log_min_mkey_entity_size[0x5]; @@ -11918,7 +11923,8 @@ struct mlx5_ifc_query_vhca_migration_state_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x10]; + u8 incremental[0x1]; + u8 reserved_at_41[0xf]; u8 vhca_id[0x10]; u8 reserved_at_60[0x20]; @@ -11944,7 +11950,9 @@ struct mlx5_ifc_save_vhca_state_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x10]; + u8 incremental[0x1]; + u8 set_track[0x1]; + u8 reserved_at_42[0xe]; u8 vhca_id[0x10]; u8 reserved_at_60[0x20]; -- cgit From 361dd63ed5f1103ff21e22408a142a7d40804527 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Mon, 28 Nov 2022 15:42:39 +0100 Subject: lsm: Clarify documentation of vm_enough_memory hook include/linux/lsm_hooks.h reports the result of the LSM infrastructure to the callers, not what LSMs should return to the LSM infrastructure. Clarify that and add that if all LSMs return a positive value __vm_enough_memory() will be called with cap_sys_admin set. If at least one LSM returns 0 or negative, it will be called with cap_sys_admin cleared. Signed-off-by: Roberto Sassu Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 2831efebde69..c35e260efd8c 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1398,7 +1398,11 @@ * Check permissions for allocating a new virtual mapping. * @mm contains the mm struct it is being added to. * @pages contains the number of pages. - * Return 0 if permission is granted. + * Return 0 if permission is granted by the LSM infrastructure to the + * caller. If all LSMs return a positive value, __vm_enough_memory() will + * be called with cap_sys_admin set. If at least one LSM returns 0 or + * negative, __vm_enough_memory() will be called with cap_sys_admin + * cleared. * * @ismaclabel: * Check if the extended attribute specified by @name -- cgit From 8eb687bc806932fc65de4cf60c4ecf913182231d Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Mon, 28 Nov 2022 15:42:40 +0100 Subject: lsm: Add/fix return values in lsm_hooks.h and fix formatting Ensure that for non-void LSM hooks there is a description of the return values. Also, replace spaces with tab for indentation, remove empty lines between the hook description and the list of parameters, adjust semicolons and add the period at the end of the parameter description. Finally, move the description of gfp parameter of the xfrm_policy_alloc_security hook together with the others. Signed-off-by: Roberto Sassu [PM: /replaces./replaced./] Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 221 +++++++++++++++++++++++++++++----------------- 1 file changed, 138 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index c35e260efd8c..bad3b6baad86 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -92,13 +92,17 @@ * is initialised to NULL by the caller. * @fc indicates the new filesystem context. * @src_fc indicates the original filesystem context. + * Return 0 on success or a negative error code on failure. * @fs_context_parse_param: * Userspace provided a parameter to configure a superblock. The LSM may * reject it with an error and may use it for itself, in which case it * should return 0; otherwise it should return -ENOPARAM to pass it on to * the filesystem. * @fc indicates the filesystem context. - * @param The parameter + * @param The parameter. + * Return 0 to indicate that the parameter should be passed on to the + * filesystem, 1 to indicate that the parameter should be discarded or an + * error to indicate that the parameter should be rejected. * * Security hooks for filesystem operations. * @@ -118,6 +122,7 @@ * Free memory associated with @mnt_ops. * @sb_eat_lsm_opts: * Eat (scan @orig options) and save them in @mnt_opts. + * Return 0 on success, negative values on failure. * @sb_statfs: * Check permission before obtaining filesystem statistics for the @mnt * mountpoint. @@ -139,19 +144,21 @@ * @sb_mnt_opts_compat: * Determine if the new mount options in @mnt_opts are allowed given * the existing mounted filesystem at @sb. - * @sb superblock being compared - * @mnt_opts new mount options + * @sb superblock being compared. + * @mnt_opts new mount options. * Return 0 if options are compatible. * @sb_remount: * Extracts security system specific mount options and verifies no changes * are being made to those options. - * @sb superblock being remounted + * @sb superblock being remounted. * @data contains the filesystem-specific data. * Return 0 if permission is granted. * @sb_kern_mount: - * Mount this @sb if allowed by permissions. + * Mount this @sb if allowed by permissions. + * Return 0 if permission is granted. * @sb_show_options: * Show (print on @m) mount options for this @sb. + * Return 0 on success, negative values on failure. * @sb_umount: * Check permission before the @mnt file system is unmounted. * @mnt contains the mounted file system. @@ -165,27 +172,31 @@ * Return 0 if permission is granted. * @sb_set_mnt_opts: * Set the security relevant mount options used for a superblock - * @sb the superblock to set security mount options for - * @opts binary data structure containing all lsm mount data + * @sb the superblock to set security mount options for. + * @opts binary data structure containing all lsm mount data. + * Return 0 on success, error on failure. * @sb_clone_mnt_opts: * Copy all security options from a given superblock to another - * @oldsb old superblock which contain information to clone - * @newsb new superblock which needs filled in + * @oldsb old superblock which contain information to clone. + * @newsb new superblock which needs filled in. + * Return 0 on success, error on failure. * @move_mount: * Check permission before a mount is moved. * @from_path indicates the mount that is going to be moved. * @to_path indicates the mountpoint that will be mounted upon. + * Return 0 if permission is granted. * @dentry_init_security: * Compute a context for a dentry as the inode is not yet available * since NFSv4 has no label backed by an EA anyway. * @dentry dentry to use in calculating the context. * @mode mode used to determine resource type. - * @name name of the last path component used to create file + * @name name of the last path component used to create file. * @xattr_name pointer to place the pointer to security xattr name. * Caller does not have to free the resulting pointer. Its * a pointer to static string. * @ctx pointer to place the pointer to the resulting context in. * @ctxlen point to place the length of the resulting context. + * Return 0 on success, negative values on failure. * @dentry_create_files_as: * Compute a context for a dentry as the inode is not yet available * and set that context in passed in creds so that new files are @@ -193,9 +204,10 @@ * passed in creds and not the creds of the caller. * @dentry dentry to use in calculating the context. * @mode mode used to determine resource type. - * @name name of the last path component used to create file - * @old creds which should be used for context calculation - * @new creds to modify + * @name name of the last path component used to create file. + * @old creds which should be used for context calculation. + * @new creds to modify. + * Return 0 on success, error on failure. * * * Security hooks for inode operations. @@ -223,7 +235,7 @@ * then it should return -EOPNOTSUPP to skip this processing. * @inode contains the inode structure of the newly created inode. * @dir contains the inode structure of the parent directory. - * @qstr contains the last path component of the new object + * @qstr contains the last path component of the new object. * @name will be set to the allocated name suffix (e.g. selinux). * @value will be set to the allocated attribute value. * @len will be set to the length of the value. @@ -234,9 +246,9 @@ * Set up the incore security field for the new anonymous inode * and return whether the inode creation is permitted by the security * module or not. - * @inode contains the inode structure - * @name name of the anonymous inode class - * @context_inode optional related inode + * @inode contains the inode structure. + * @name name of the anonymous inode class. + * @context_inode optional related inode. * Returns 0 on success, -EACCES if the security module denies the * creation of this inode, or another -errno upon other errors. * @inode_create: @@ -352,7 +364,7 @@ * mode is specified in @mode. * @path contains the path structure of the file to change the mode. * @mode contains the new DAC's permission, which is a bitmask of - * constants from + * constants from . * Return 0 if permission is granted. * @path_chown: * Check for permission to change owner/group of a file or directory. @@ -367,6 +379,7 @@ * @path_notify: * Check permissions before setting a watch on events as defined by @mask, * on an object at @path, whose type is defined by @obj_type. + * Return 0 if permission is granted. * @inode_readlink: * Check the permission to read the symbolic link. * @dentry contains the dentry structure for the file link. @@ -374,7 +387,7 @@ * @inode_follow_link: * Check permission to follow a symbolic link when looking up a pathname. * @dentry contains the dentry structure for the link. - * @inode contains the inode, which itself is not stable in RCU-walk + * @inode contains the inode, which itself is not stable in RCU-walk. * @rcu indicates whether we are in RCU-walk mode. * Return 0 if permission is granted. * @inode_permission: @@ -426,9 +439,9 @@ * Retrieve a copy of the extended attribute representation of the * security label associated with @name for @inode via @buffer. Note that * @name is the remainder of the attribute name after the security prefix - * has been removed. @alloc is used to specify of the call should return a - * value via the buffer or just the value length Return size of buffer on - * success. + * has been removed. @alloc is used to specify if the call should return a + * value via the buffer or just the value length. + * Return size of buffer on success. * @inode_setsecurity: * Set the security label associated with @name for @inode from the * extended attribute value @value. @size indicates the size of the @@ -451,7 +464,7 @@ * @inode_killpriv: * The setuid bit is being removed. Remove similar security labels. * Called with the dentry->d_inode->i_mutex held. - * @mnt_userns: user namespace of the mount + * @mnt_userns: user namespace of the mount. * @dentry is the dentry being changed. * Return 0 on success. If error is returned, then the operation * causing setuid bit removal is failed. @@ -478,20 +491,22 @@ * to abort the copy up. Note that the caller is responsible for reading * and writing the xattrs as this hook is merely a filter. * @d_instantiate: - * Fill in @inode security information for a @dentry if allowed. + * Fill in @inode security information for a @dentry if allowed. * @getprocattr: - * Read attribute @name for process @p and store it into @value if allowed. + * Read attribute @name for process @p and store it into @value if allowed. + * Return the length of @value on success, a negative value otherwise. * @setprocattr: - * Write (set) attribute @name to @value, size @size if allowed. + * Write (set) attribute @name to @value, size @size if allowed. + * Return written bytes on success, a negative value otherwise. * * Security hooks for kernfs node operations * * @kernfs_init_security: * Initialize the security context of a newly created kernfs node based * on its own and its parent's attributes. - * - * @kn_dir the parent kernfs node - * @kn the new child kernfs node + * @kn_dir the parent kernfs node. + * @kn the new child kernfs node. + * Return 0 if permission is granted. * * Security hooks for file operations * @@ -530,11 +545,11 @@ * simple integer value. When @arg represents a user space pointer, it * should never be used by the security module. * Return 0 if permission is granted. - * @mmap_addr : + * @mmap_addr: * Check permissions for a mmap operation at @addr. * @addr contains virtual address that will be used for the operation. * Return 0 if permission is granted. - * @mmap_file : + * @mmap_file: * Check permissions for a mmap operation. The @file may be NULL, e.g. * if mapping anonymous memory. * @file contains the file structure for file to map (may be NULL). @@ -589,6 +604,7 @@ * Save open-time permission checking state for later use upon * file_permission, and recheck access if anything has changed * since inode_permission. + * Return 0 if permission is granted. * * Security hooks for task operations. * @@ -606,6 +622,7 @@ * @gfp indicates the atomicity of any memory allocations. * Only allocate sufficient memory and attach to @cred such that * cred_transfer() will not get ENOMEM. + * Return 0 on success, negative values on failure. * @cred_free: * @cred points to the credentials. * Deallocate and clear the cred->security field in a set of credentials. @@ -614,6 +631,7 @@ * @old points to the original credentials. * @gfp indicates the atomicity of any memory allocations. * Prepare a new set of credentials by copying the data from the old set. + * Return 0 on success, negative values on failure. * @cred_transfer: * @new points to the new credentials. * @old points to the original credentials. @@ -625,7 +643,7 @@ * @kernel_act_as: * Set the credentials for a kernel service to act as (subjective context). * @new points to the credentials to be modified. - * @secid specifies the security ID to be set + * @secid specifies the security ID to be set. * The current task must be the one that nominated @secid. * Return 0 if successful. * @kernel_create_files_as: @@ -638,19 +656,19 @@ * @kernel_module_request: * Ability to trigger the kernel to automatically upcall to userspace for * userspace to load a kernel module with the given name. - * @kmod_name name of the module requested by the kernel + * @kmod_name name of the module requested by the kernel. * Return 0 if successful. * @kernel_load_data: * Load data provided by userspace. - * @id kernel load data identifier + * @id kernel load data identifier. * @contents if a subsequent @kernel_post_load_data will be called. * Return 0 if permission is granted. * @kernel_post_load_data: * Load data provided by a non-file source (usually userspace buffer). * @buf pointer to buffer containing the data contents. * @size length of the data contents. - * @id kernel load data identifier - * @description a text description of what was loaded, @id-specific + * @id kernel load data identifier. + * @description a text description of what was loaded, @id-specific. * Return 0 if permission is granted. * This must be paired with a prior @kernel_load_data call that had * @contents set to true. @@ -658,7 +676,7 @@ * Read a file specified by userspace. * @file contains the file structure pointing to the file being read * by the kernel. - * @id kernel read file identifier + * @id kernel read file identifier. * @contents if a subsequent @kernel_post_read_file will be called. * Return 0 if permission is granted. * @kernel_post_read_file: @@ -667,7 +685,7 @@ * by the kernel. * @buf pointer to buffer containing the file contents. * @size length of the file contents. - * @id kernel read file identifier + * @id kernel read file identifier. * This must be paired with a prior @kernel_read_file call that had * @contents set to true. * Return 0 if permission is granted. @@ -677,7 +695,7 @@ * indicates which of the set*uid system calls invoked this hook. If * @new is the set of credentials that will be installed. Modifications * should be made to this rather than to @current->cred. - * @old is the set of credentials that are being replaces + * @old is the set of credentials that are being replaced. * @flags contains one of the LSM_SETID_* values. * Return 0 on success. * @task_fix_setgid: @@ -729,7 +747,7 @@ * @task_setioprio: * Check permission before setting the ioprio value of @p to @ioprio. * @p contains the task_struct of process. - * @ioprio contains the new ioprio value + * @ioprio contains the new ioprio value. * Return 0 if permission is granted. * @task_getioprio: * Check permission before getting the ioprio value of @p. @@ -860,6 +878,7 @@ * @type contains the requested communications type. * @protocol contains the requested protocol. * @kern set to 1 if a kernel socket. + * Return 0 if permission is granted. * @socket_socketpair: * Check permissions before creating a fresh pair of sockets. * @socka contains the first socket structure. @@ -943,6 +962,7 @@ * Must not sleep inside this hook because some callers hold spinlocks. * @sk contains the sock (not socket) associated with the incoming sk_buff. * @skb contains the incoming network data. + * Return 0 if permission is granted. * @socket_getpeersec_stream: * This hook allows the security module to provide peer socket security * state for unix or connected tcp sockets to userspace via getsockopt @@ -970,6 +990,7 @@ * @sk_alloc_security: * Allocate and attach a security structure to the sk->sk_security field, * which is used to copy security attributes between local stream sockets. + * Return 0 on success, error on failure. * @sk_free_security: * Deallocate security structure. * @sk_clone_security: @@ -982,17 +1003,19 @@ * @inet_conn_request: * Sets the openreq's sid to socket's sid with MLS portion taken * from peer sid. + * Return 0 if permission is granted. * @inet_csk_clone: * Sets the new child socket's sid to the openreq sid. * @inet_conn_established: * Sets the connection's peersid to the secmark on skb. * @secmark_relabel_packet: - * check if the process should be allowed to relabel packets to - * the given secid + * Check if the process should be allowed to relabel packets to + * the given secid. + * Return 0 if permission is granted. * @secmark_refcount_inc: - * tells the LSM to increment the number of secmark labeling rules loaded + * Tells the LSM to increment the number of secmark labeling rules loaded. * @secmark_refcount_dec: - * tells the LSM to decrement the number of secmark labeling rules loaded + * Tells the LSM to decrement the number of secmark labeling rules loaded. * @req_classify_flow: * Sets the flow's sid to the openreq sid. * @tun_dev_alloc_security: @@ -1003,21 +1026,25 @@ * @tun_dev_free_security: * This hook allows a module to free the security structure for a TUN * device. - * @security pointer to the TUN device's security structure + * @security pointer to the TUN device's security structure. * @tun_dev_create: * Check permissions prior to creating a new TUN device. + * Return 0 if permission is granted. * @tun_dev_attach_queue: * Check permissions prior to attaching to a TUN device queue. * @security pointer to the TUN device's security structure. + * Return 0 if permission is granted. * @tun_dev_attach: * This hook can be used by the module to update any security state * associated with the TUN device's sock structure. * @sk contains the existing sock structure. * @security pointer to the TUN device's security structure. + * Return 0 if permission is granted. * @tun_dev_open: * This hook can be used by the module to update any security state * associated with the TUN device's security structure. * @security pointer to the TUN devices's security structure. + * Return 0 if permission is granted. * * Security hooks for SCTP * @@ -1050,6 +1077,7 @@ * to the security module. * @asoc pointer to sctp association structure. * @skb pointer to skbuff of association packet. + * Return 0 if permission is granted. * * Security hooks for Infiniband * @@ -1058,15 +1086,17 @@ * @subnet_prefix the subnet prefix of the port being used. * @pkey the pkey to be accessed. * @sec pointer to a security structure. + * Return 0 if permission is granted. * @ib_endport_manage_subnet: * Check permissions to send and receive SMPs on a end port. * @dev_name the IB device name (i.e. mlx4_0). * @port_num the port number. * @sec pointer to a security structure. + * Return 0 if permission is granted. * @ib_alloc_security: * Allocate a security structure for Infiniband objects. * @sec pointer to a security structure pointer. - * Returns 0 on success, non-zero on failure + * Returns 0 on success, non-zero on failure. * @ib_free_security: * Deallocate an Infiniband security structure. * @sec contains the security structure to be freed. @@ -1078,10 +1108,11 @@ * Database used by the XFRM system. * @sec_ctx contains the security context information being provided by * the user-level policy update program (e.g., setkey). + * @gfp is to specify the context for the allocation. * Allocate a security structure to the xp->security field; the security * field is initialized to NULL when the xfrm_policy is allocated. - * Return 0 if operation was successful (memory to allocate, legal context) - * @gfp is to specify the context for the allocation + * Return 0 if operation was successful (memory to allocate, legal + * context). * @xfrm_policy_clone_security: * @old_ctx contains an existing xfrm_sec_ctx. * @new_ctxp contains a new xfrm_sec_ctx being cloned from old. @@ -1089,11 +1120,12 @@ * information from the old_ctx structure. * Return 0 if operation was successful (memory to allocate). * @xfrm_policy_free_security: - * @ctx contains the xfrm_sec_ctx + * @ctx contains the xfrm_sec_ctx. * Deallocate xp->security. * @xfrm_policy_delete_security: * @ctx contains the xfrm_sec_ctx. * Authorize deletion of xp->security. + * Return 0 if permission is granted. * @xfrm_state_alloc: * @x contains the xfrm_state being added to the Security Association * Database by the XFRM system. @@ -1119,6 +1151,7 @@ * @xfrm_state_delete_security: * @x contains the xfrm_state. * Authorize deletion of x->security. + * Return 0 if permission is granted. * @xfrm_policy_lookup: * @ctx contains the xfrm_sec_ctx for which the access control is being * checked. @@ -1147,7 +1180,7 @@ * Permit allocation of a key and assign security data. Note that key does * not have a serial number assigned at this point. * @key points to the key. - * @flags is the allocation flags + * @flags is the allocation flags. * Return 0 if permission is granted, -ve error otherwise. * @key_free: * Notification of destruction; free security data. @@ -1177,8 +1210,8 @@ * * @ipc_permission: * Check permissions for access to IPC - * @ipcp contains the kernel IPC permission structure - * @flag contains the desired (requested) permission set + * @ipcp contains the kernel IPC permission structure. + * @flag contains the desired (requested) permission set. * Return 0 if permission is granted. * @ipc_getsecid: * Get the secid associated with the ipc object. @@ -1323,15 +1356,18 @@ * to @to. * @from contains the struct cred for the sending process. * @to contains the struct cred for the receiving process. + * Return 0 if permission is granted. * @binder_transfer_binder: * Check whether @from is allowed to transfer a binder reference to @to. * @from contains the struct cred for the sending process. * @to contains the struct cred for the receiving process. + * Return 0 if permission is granted. * @binder_transfer_file: * Check whether @from is allowed to transfer @file to @to. * @from contains the struct cred for the sending process. * @file contains the struct file being transferred. * @to contains the struct cred for the receiving process. + * Return 0 if permission is granted. * * @ptrace_access_check: * Check permission before allowing the current process to trace the @@ -1373,26 +1409,29 @@ * Check whether the @tsk process has the @cap capability in the indicated * credentials. * @cred contains the credentials to use. - * @ns contains the user namespace we want the capability in + * @ns contains the user namespace we want the capability in. * @cap contains the capability . - * @opts contains options for the capable check + * @opts contains options for the capable check . * Return 0 if the capability is granted for @tsk. * @quotactl: - * Check whether the quotactl syscall is allowed for this @sb. + * Check whether the quotactl syscall is allowed for this @sb. + * Return 0 if permission is granted. * @quota_on: - * Check whether QUOTAON is allowed for this @dentry. + * Check whether QUOTAON is allowed for this @dentry. + * Return 0 if permission is granted. * @syslog: * Check permission before accessing the kernel message ring or changing * logging to the console. * See the syslog(2) manual page for an explanation of the @type values. - * @type contains the SYSLOG_ACTION_* constant from + * @type contains the SYSLOG_ACTION_* constant from + * . * Return 0 if permission is granted. * @settime: * Check permission to change the system time. * struct timespec64 is defined in and timezone * is defined in - * @ts contains new time - * @tz contains new timezone + * @ts contains new time. + * @tz contains new timezone. * Return 0 if permission is granted. * @vm_enough_memory: * Check permissions for allocating a new virtual mapping. @@ -1420,11 +1459,13 @@ * @secid contains the security ID. * @secdata contains the pointer that stores the converted security * context. - * @seclen pointer which contains the length of the data + * @seclen pointer which contains the length of the data. + * Return 0 on success, error on failure. * @secctx_to_secid: * Convert security context to secid. * @secid contains the pointer to the generated security ID. * @secdata contains the security context. + * Return 0 on success, error on failure. * * @release_secctx: * Release the security context. @@ -1461,7 +1502,7 @@ * @audit_rule_free: * Deallocate the LSM audit rule structure previously allocated by * audit_rule_init. - * @lsmrule contains the allocated rule + * @lsmrule contains the allocated rule. * * @inode_invalidate_secctx: * Notify the security module that it must revalidate the security context @@ -1478,6 +1519,7 @@ * @inode we wish to set the security context of. * @ctx contains the string which we wish to set in the inode. * @ctxlen contains the length of @ctx. + * Return 0 on success, error on failure. * * @inode_setsecctx: * Change the security context of an inode. Updates the @@ -1491,6 +1533,7 @@ * @dentry contains the inode we wish to set the security context of. * @ctx contains the string which we wish to set in the inode. * @ctxlen contains the length of @ctx. + * Return 0 on success, error on failure. * * @inode_getsecctx: * On success, returns 0 and fills out @ctx and @ctxlen with the security @@ -1498,6 +1541,7 @@ * @inode we wish to get the security context of. * @ctx is a pointer in which to place the allocated security context. * @ctxlen points to the place to put the length of @ctx. + * Return 0 on success, error on failure. * * Security hooks for the general notification queue: * @@ -1505,13 +1549,15 @@ * Check to see if a watch notification can be posted to a particular * queue. * @w_cred: The credentials of the whoever set the watch. - * @cred: The event-triggerer's credentials - * @n: The notification being posted + * @cred: The event-triggerer's credentials. + * @n: The notification being posted. + * Return 0 if permission is granted. * * @watch_key: * Check to see if a process is allowed to watch for event notifications * from a key or keyring. * @key: The key to watch. + * Return 0 if permission is granted. * * Security hooks for using the eBPF maps and programs functionalities through * eBPF syscalls. @@ -1520,65 +1566,74 @@ * Do a initial check for all bpf syscalls after the attribute is copied * into the kernel. The actual security module can implement their own * rules to check the specific cmd they need. + * Return 0 if permission is granted. * * @bpf_map: * Do a check when the kernel generate and return a file descriptor for * eBPF maps. - * - * @map: bpf map that we want to access - * @mask: the access flags + * @map: bpf map that we want to access. + * @mask: the access flags. + * Return 0 if permission is granted. * * @bpf_prog: * Do a check when the kernel generate and return a file descriptor for * eBPF programs. - * * @prog: bpf prog that userspace want to use. + * Return 0 if permission is granted. * * @bpf_map_alloc_security: * Initialize the security field inside bpf map. + * Return 0 on success, error on failure. * * @bpf_map_free_security: * Clean up the security information stored inside bpf map. * * @bpf_prog_alloc_security: * Initialize the security field inside bpf program. + * Return 0 on success, error on failure. * * @bpf_prog_free_security: * Clean up the security information stored inside bpf prog. * * @locked_down: - * Determine whether a kernel feature that potentially enables arbitrary - * code execution in kernel space should be permitted. - * - * @what: kernel feature being accessed + * Determine whether a kernel feature that potentially enables arbitrary + * code execution in kernel space should be permitted. + * @what: kernel feature being accessed. + * Return 0 if permission is granted. * * Security hooks for perf events * * @perf_event_open: - * Check whether the @type of perf_event_open syscall is allowed. + * Check whether the @type of perf_event_open syscall is allowed. + * Return 0 if permission is granted. * @perf_event_alloc: - * Allocate and save perf_event security info. + * Allocate and save perf_event security info. + * Return 0 on success, error on failure. * @perf_event_free: - * Release (free) perf_event security info. + * Release (free) perf_event security info. * @perf_event_read: - * Read perf_event security info if allowed. + * Read perf_event security info if allowed. + * Return 0 if permission is granted. * @perf_event_write: - * Write perf_event security info if allowed. + * Write perf_event security info if allowed. + * Return 0 if permission is granted. * * Security hooks for io_uring * * @uring_override_creds: - * Check if the current task, executing an io_uring operation, is allowed - * to override it's credentials with @new. - * - * @new: the new creds to use + * Check if the current task, executing an io_uring operation, is allowed + * to override it's credentials with @new. + * @new: the new creds to use. + * Return 0 if permission is granted. * * @uring_sqpoll: - * Check whether the current task is allowed to spawn a io_uring polling - * thread (IORING_SETUP_SQPOLL). + * Check whether the current task is allowed to spawn a io_uring polling + * thread (IORING_SETUP_SQPOLL). + * Return 0 if permission is granted. * * @uring_cmd: - * Check whether the file_operations uring_cmd is allowed to run. + * Check whether the file_operations uring_cmd is allowed to run. + * Return 0 if permission is granted. * */ union security_list_options { -- cgit From 8669247524c73e16e4d3384c4ff882e5c5d06194 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 24 Nov 2022 11:42:11 +0800 Subject: fscache,cachefiles: add prepare_ondemand_read() callback Add prepare_ondemand_read() callback dedicated for the on-demand read scenario, so that callers from this scenario can be decoupled from netfs_io_subrequest. The original cachefiles_prepare_read() is now refactored to a generic routine accepting a parameter list instead of netfs_io_subrequest. There's no logic change, except that the debug id of subrequest and request is removed from trace_cachefiles_prep_read(). Reviewed-by: Jeff Layton Signed-off-by: Jingbo Xu Acked-by: David Howells Link: https://lore.kernel.org/r/20221124034212.81892-2-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- include/linux/netfs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index f2402ddeafbf..4c76ddfb6a67 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -267,6 +267,14 @@ struct netfs_cache_ops { loff_t *_start, size_t *_len, loff_t i_size, bool no_space_allocated_yet); + /* Prepare an on-demand read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ + enum netfs_io_source (*prepare_ondemand_read)(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, + loff_t i_size, + unsigned long *_flags, ino_t ino); + /* Query the occupancy of the cache in a region, returning where the * next chunk of data starts and how long it is. */ -- cgit From bffdeaa8a5af7200b0e74c9d5a41167f86626a36 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Dec 2022 15:33:43 -0800 Subject: bpf: decouple prune and jump points BPF verifier marks some instructions as prune points. Currently these prune points serve two purposes. It's a point where verifier tries to find previously verified state and check current state's equivalence to short circuit verification for current code path. But also currently it's a point where jump history, used for precision backtracking, is updated. This is done so that non-linear flow of execution could be properly backtracked. Such coupling is coincidental and unnecessary. Some prune points are not part of some non-linear jump path, so don't need update of jump history. On the other hand, not all instructions which have to be recorded in jump history necessarily are good prune points. This patch splits prune and jump points into independent flags. Currently all prune points are marked as jump points to minimize amount of changes in this patch, but next patch will perform some optimization of prune vs jmp point placement. No functional changes are intended. Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221206233345.438540-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c07b351a5bc7..70d06a99f0b8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -452,6 +452,7 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ bool prune_point; + bool jmp_point; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ -- cgit From ed883bec679b027b198d57a336715f8298fb88b4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 5 Dec 2022 12:34:42 +0100 Subject: net: ethernet: mtk_wed: add reset to rx_ring_setup callback This patch adds reset parameter to mtk_wed_rx_ring_setup signature in order to align rx_ring_setup callback to tx_ring_setup one introduced in 'commit 23dca7a90017 ("net: ethernet: mtk_wed: add reset to tx_ring_setup callback")' Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/29c6e7a5469e784406cf3e2920351d1207713d05.1670239984.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/soc/mediatek/mtk_wed.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index beb190449704..a0746d4aec20 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -160,7 +160,7 @@ struct mtk_wed_ops { int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, - void __iomem *regs); + void __iomem *regs, bool reset); int (*txfree_ring_setup)(struct mtk_wed_device *dev, void __iomem *regs); int (*msg_update)(struct mtk_wed_device *dev, int cmd_id, @@ -228,8 +228,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) (_dev)->ops->irq_get(_dev, _mask) #define mtk_wed_device_irq_set_mask(_dev, _mask) \ (_dev)->ops->irq_set_mask(_dev, _mask) -#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) \ - (_dev)->ops->rx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs, _reset) \ + (_dev)->ops->rx_ring_setup(_dev, _ring, _regs, _reset) #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) \ (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ @@ -249,7 +249,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) #define mtk_wed_device_irq_get(_dev, _mask) 0 #define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0) -#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs, _reset) -ENODEV #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV #define mtk_wed_device_stop(_dev) do {} while (0) -- cgit From 43c487c28f9c353a9690f975e7b7dc56be144427 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 2 Nov 2022 13:54:30 +0100 Subject: mmc: tmio: remove 'alignment_shift' from platform data There is only one alignment shift for one type of Renesas SDHI. Encode it directly in its DMA driver to reduce complexity and ease further simplifications. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20221102125430.28466-3-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- include/linux/mfd/tmio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 27264fe4b3b9..e8bf90281ba0 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -102,7 +102,6 @@ struct tmio_mmc_data { unsigned long capabilities2; unsigned long flags; u32 ocr_mask; /* available voltages */ - int alignment_shift; dma_addr_t dma_rx_offset; unsigned int max_blk_count; unsigned short max_segs; -- cgit From 495b637f640b7f87ab42f3cc829e35ea76a1f3eb Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 6 Dec 2022 17:59:40 +0100 Subject: iommu: Add note about struct iommu_fwspec usage This structure is to be considered private to the IOMMU API. Except for very few exceptions, IOMMU consumer drivers should treat this as opaque data. Acked-by: Joerg Roedel Signed-off-by: Thierry Reding Link: https://lore.kernel.org/r/20221206165945.3551774-2-thierry.reding@gmail.com Signed-off-by: Ulf Hansson --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3c9da1f8979e..20b592dec335 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -605,6 +605,10 @@ struct iommu_group *fsl_mc_device_group(struct device *dev); * @flags: IOMMU_FWSPEC_* flags * @num_ids: number of associated device IDs * @ids: IDs which this device may present to the IOMMU + * + * Note that the IDs (and any other information, really) stored in this structure should be + * considered private to the IOMMU device driver and are not to be used directly by IOMMU + * consumers. */ struct iommu_fwspec { const struct iommu_ops *ops; -- cgit From 493c9b68d1d8765b2f8740a06c169e90a947159f Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 6 Dec 2022 17:59:41 +0100 Subject: iommu/tegra: Add tegra_dev_iommu_get_stream_id() helper Access to the internals of struct iommu_fwspec by non-IOMMU drivers is discouraged. Many drivers for Tegra SoCs, however, need access to their IOMMU stream IDs so that they can be programmed into various hardware registers. Formalize this access into a common helper to make it easier to audit and maintain. Acked-by: Robin Murphy Signed-off-by: Thierry Reding Link: https://lore.kernel.org/r/20221206165945.3551774-3-thierry.reding@gmail.com Signed-off-by: Ulf Hansson --- include/linux/iommu.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 20b592dec335..6f53ad74fa0d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1103,4 +1103,25 @@ static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_m #endif /* CONFIG_IOMMU_DMA */ +/* + * Newer generations of Tegra SoCs require devices' stream IDs to be directly programmed into + * some registers. These are always paired with a Tegra SMMU or ARM SMMU, for which the contents + * of the struct iommu_fwspec are known. Use this helper to formalize access to these internals. + */ +#define TEGRA_STREAM_ID_BYPASS 0x7f + +static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream_id) +{ +#ifdef CONFIG_IOMMU_API + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + + if (fwspec && fwspec->num_ids == 1) { + *stream_id = fwspec->ids[0] & 0xffff; + return true; + } +#endif + + return false; +} + #endif /* __LINUX_IOMMU_H */ -- cgit From 7ca91a33775c4a33cb451f508f84a7820179c73b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 25 Sep 2022 22:44:19 -0700 Subject: mfd: palmas: Stop including of_gpio.h It does not appear that any of palmas sub-drivers are using OF-based gpio APIs, so let's stop including this header. Signed-off-by: Dmitry Torokhov Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220926054421.1546436-3-dmitry.torokhov@gmail.com --- include/linux/mfd/palmas.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h index 1e61c7e9f50d..117d02708439 100644 --- a/include/linux/mfd/palmas.h +++ b/include/linux/mfd/palmas.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #define PALMAS_NUM_CLIENTS 3 -- cgit From 3c92699a167a543b2bc7d603e4a09aa78a99f809 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 25 Sep 2022 22:44:20 -0700 Subject: mfd: twl6040: Switch to using gpiod API This patch switches the dirver from legacy gpio API to a newer gpiod API so that we can eventually drop the former. Signed-off-by: Dmitry Torokhov Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220926054421.1546436-4-dmitry.torokhov@gmail.com --- include/linux/mfd/twl6040.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/twl6040.h b/include/linux/mfd/twl6040.h index 1fc7450bd8ab..cb1e7a9ce317 100644 --- a/include/linux/mfd/twl6040.h +++ b/include/linux/mfd/twl6040.h @@ -196,13 +196,14 @@ struct twl6040_gpo_data { }; struct twl6040_platform_data { - int audpwron_gpio; /* audio power-on gpio */ + struct gpio_desc *audpwron_gpio; /* audio power-on gpio */ struct twl6040_codec_data *codec; struct twl6040_vibra_data *vibra; struct twl6040_gpo_data *gpo; }; +struct gpio_desc; struct regmap; struct regmap_irq_chips_data; @@ -218,7 +219,7 @@ struct twl6040 { struct mfd_cell cells[TWL6040_CELLS]; struct completion ready; - int audpwron; + struct gpio_desc *audpwron; int power_count; int rev; -- cgit From 1f7caaa1743edbc40f3cd7e2bc0dff89698fd91d Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 25 Sep 2022 22:44:21 -0700 Subject: mfd: twl6040: Drop twl6040_platform_data and associated definitions As of df04b6242a58 ("mfd: twl6040: Remove support for legacy (pdata) mode") the driver no longer references the platform data, so we can drop its definition, as well as definitions of related structures. Signed-off-by: Dmitry Torokhov Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220926054421.1546436-5-dmitry.torokhov@gmail.com --- include/linux/mfd/twl6040.h | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/twl6040.h b/include/linux/mfd/twl6040.h index cb1e7a9ce317..286a724e379a 100644 --- a/include/linux/mfd/twl6040.h +++ b/include/linux/mfd/twl6040.h @@ -174,35 +174,6 @@ #define TWL6040_GPO_MAX 3 -/* TODO: All platform data struct can be removed */ -struct twl6040_codec_data { - u16 hs_left_step; - u16 hs_right_step; - u16 hf_left_step; - u16 hf_right_step; -}; - -struct twl6040_vibra_data { - unsigned int vibldrv_res; /* left driver resistance */ - unsigned int vibrdrv_res; /* right driver resistance */ - unsigned int viblmotor_res; /* left motor resistance */ - unsigned int vibrmotor_res; /* right motor resistance */ - int vddvibl_uV; /* VDDVIBL volt, set 0 for fixed reg */ - int vddvibr_uV; /* VDDVIBR volt, set 0 for fixed reg */ -}; - -struct twl6040_gpo_data { - int gpio_base; -}; - -struct twl6040_platform_data { - struct gpio_desc *audpwron_gpio; /* audio power-on gpio */ - - struct twl6040_codec_data *codec; - struct twl6040_vibra_data *vibra; - struct twl6040_gpo_data *gpo; -}; - struct gpio_desc; struct regmap; struct regmap_irq_chips_data; -- cgit From fc45720334ebc9f97335a9ce67896811354f0a1c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Oct 2022 17:29:32 +0200 Subject: mfd: Remove dm355evm_msp driver The DaVinci DM355EVM platform is gone after the removal of all unused board files, so the MTD device along with its sub-devices can be removed as well. Signed-off-by: Arnd Bergmann Acked-by: Bartosz Golaszewski Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20221019152947.3857217-7-arnd@kernel.org --- include/linux/mfd/dm355evm_msp.h | 79 ---------------------------------------- 1 file changed, 79 deletions(-) delete mode 100644 include/linux/mfd/dm355evm_msp.h (limited to 'include/linux') diff --git a/include/linux/mfd/dm355evm_msp.h b/include/linux/mfd/dm355evm_msp.h deleted file mode 100644 index 372470350fab..000000000000 --- a/include/linux/mfd/dm355evm_msp.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * dm355evm_msp.h - support MSP430 microcontroller on DM355EVM board - */ -#ifndef __LINUX_I2C_DM355EVM_MSP -#define __LINUX_I2C_DM355EVM_MSP - -/* - * Written against Spectrum's writeup for the A4 firmware revision, - * and tweaked to match source and rev D2 schematics by removing CPLD - * and NOR flash hooks (which were last appropriate in rev B boards). - * - * Note that the firmware supports a flavor of write posting ... to be - * sure a write completes, issue another read or write. - */ - -/* utilities to access "registers" emulated by msp430 firmware */ -extern int dm355evm_msp_write(u8 value, u8 reg); -extern int dm355evm_msp_read(u8 reg); - - -/* command/control registers */ -#define DM355EVM_MSP_COMMAND 0x00 -# define MSP_COMMAND_NULL 0 -# define MSP_COMMAND_RESET_COLD 1 -# define MSP_COMMAND_RESET_WARM 2 -# define MSP_COMMAND_RESET_WARM_I 3 -# define MSP_COMMAND_POWEROFF 4 -# define MSP_COMMAND_IR_REINIT 5 -#define DM355EVM_MSP_STATUS 0x01 -# define MSP_STATUS_BAD_OFFSET BIT(0) -# define MSP_STATUS_BAD_COMMAND BIT(1) -# define MSP_STATUS_POWER_ERROR BIT(2) -# define MSP_STATUS_RXBUF_OVERRUN BIT(3) -#define DM355EVM_MSP_RESET 0x02 /* 0 bits == in reset */ -# define MSP_RESET_DC5 BIT(0) -# define MSP_RESET_TVP5154 BIT(2) -# define MSP_RESET_IMAGER BIT(3) -# define MSP_RESET_ETHERNET BIT(4) -# define MSP_RESET_SYS BIT(5) -# define MSP_RESET_AIC33 BIT(7) - -/* GPIO registers ... bit patterns mostly match the source MSP ports */ -#define DM355EVM_MSP_LED 0x03 /* active low (MSP P4) */ -#define DM355EVM_MSP_SWITCH1 0x04 /* (MSP P5, masked) */ -# define MSP_SWITCH1_SW6_1 BIT(0) -# define MSP_SWITCH1_SW6_2 BIT(1) -# define MSP_SWITCH1_SW6_3 BIT(2) -# define MSP_SWITCH1_SW6_4 BIT(3) -# define MSP_SWITCH1_J1 BIT(4) /* NTSC/PAL */ -# define MSP_SWITCH1_MSP_INT BIT(5) /* active low */ -#define DM355EVM_MSP_SWITCH2 0x05 /* (MSP P6, masked) */ -# define MSP_SWITCH2_SW10 BIT(3) -# define MSP_SWITCH2_SW11 BIT(4) -# define MSP_SWITCH2_SW12 BIT(5) -# define MSP_SWITCH2_SW13 BIT(6) -# define MSP_SWITCH2_SW14 BIT(7) -#define DM355EVM_MSP_SDMMC 0x06 /* (MSP P2, masked) */ -# define MSP_SDMMC_0_WP BIT(1) -# define MSP_SDMMC_0_CD BIT(2) /* active low */ -# define MSP_SDMMC_1_WP BIT(3) -# define MSP_SDMMC_1_CD BIT(4) /* active low */ -#define DM355EVM_MSP_FIRMREV 0x07 /* not a GPIO (out of order) */ -#define DM355EVM_MSP_VIDEO_IN 0x08 /* (MSP P3, masked) */ -# define MSP_VIDEO_IMAGER BIT(7) /* low == tvp5146 */ - -/* power supply registers are currently omitted */ - -/* RTC registers */ -#define DM355EVM_MSP_RTC_0 0x12 /* LSB */ -#define DM355EVM_MSP_RTC_1 0x13 -#define DM355EVM_MSP_RTC_2 0x14 -#define DM355EVM_MSP_RTC_3 0x15 /* MSB */ - -/* input event queue registers; code == ((HIGH << 8) | LOW) */ -#define DM355EVM_MSP_INPUT_COUNT 0x16 /* decrement by reading LOW */ -#define DM355EVM_MSP_INPUT_HIGH 0x17 -#define DM355EVM_MSP_INPUT_LOW 0x18 - -#endif /* __LINUX_I2C_DM355EVM_MSP */ -- cgit From 707857d997ae39743eba939a5b3aaafbab04fa78 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Oct 2022 17:03:39 +0200 Subject: mfd: Remove htc-i2cpld driver The HTC Herald machine was removed, so this driver is no longer used anywhere. Cc: Cory Maccarrone Signed-off-by: Arnd Bergmann Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20221019150410.3851944-17-arnd@kernel.org --- include/linux/htcpld.h | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 include/linux/htcpld.h (limited to 'include/linux') diff --git a/include/linux/htcpld.h b/include/linux/htcpld.h deleted file mode 100644 index 5f8ac9b1d724..000000000000 --- a/include/linux/htcpld.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_HTCPLD_H -#define __LINUX_HTCPLD_H - -struct htcpld_chip_platform_data { - unsigned int addr; - unsigned int reset; - unsigned int num_gpios; - unsigned int gpio_out_base; - unsigned int gpio_in_base; - unsigned int irq_base; - unsigned int num_irqs; -}; - -struct htcpld_core_platform_data { - unsigned int i2c_adapter_id; - - struct htcpld_chip_platform_data *chip; - unsigned int num_chip; -}; - -#endif /* __LINUX_HTCPLD_H */ - -- cgit From 245cb473e5388fcbc01c7284b6a4e1446cdbf054 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 23 Oct 2022 10:48:30 +0100 Subject: mfd: pcf50633: Remove #ifdef guards for PM related functions Use the new EXPORT_GPL_SIMPLE_DEV_PM_OPS() and pm_sleep_ptr() macros to handle the .suspend/.resume callbacks. These macros allow the suspend and resume functions to be automatically dropped by the compiler when CONFIG_SUSPEND is disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. Signed-off-by: Paul Cercueil Signed-off-by: Lee Jones --- include/linux/mfd/pcf50633/core.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/pcf50633/core.h b/include/linux/mfd/pcf50633/core.h index 3f752dc62a6c..539f27f8bd89 100644 --- a/include/linux/mfd/pcf50633/core.h +++ b/include/linux/mfd/pcf50633/core.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -226,9 +227,6 @@ static inline struct pcf50633 *dev_to_pcf50633(struct device *dev) int pcf50633_irq_init(struct pcf50633 *pcf, int irq); void pcf50633_irq_free(struct pcf50633 *pcf); -#ifdef CONFIG_PM -int pcf50633_irq_suspend(struct pcf50633 *pcf); -int pcf50633_irq_resume(struct pcf50633 *pcf); -#endif +extern const struct dev_pm_ops pcf50633_pm; #endif -- cgit From 4d8a6ae23af64a37803c0d15922819d27b4b8b08 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 23 Oct 2022 10:48:48 +0100 Subject: mfd: stmfx: Remove #ifdef guards for PM related functions Use the new DEFINE_SIMPLE_DEV_PM_OPS() and pm_sleep_ptr() macros to handle the .suspend/.resume callbacks. These macros allow the suspend and resume functions to be automatically dropped by the compiler when CONFIG_SUSPEND is disabled, without having to use #ifdef guards. This has the advantage of always compiling these functions in, independently of any Kconfig option. Thanks to that, bugs and other regressions are subsequently easier to catch. Signed-off-by: Paul Cercueil Signed-off-by: Lee Jones --- include/linux/mfd/stmfx.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/stmfx.h b/include/linux/mfd/stmfx.h index 744dce63946e..967a2e486800 100644 --- a/include/linux/mfd/stmfx.h +++ b/include/linux/mfd/stmfx.h @@ -113,10 +113,8 @@ struct stmfx { struct irq_domain *irq_domain; struct mutex lock; /* IRQ bus lock */ u8 irq_src; -#ifdef CONFIG_PM u8 bkp_sysctrl; u8 bkp_irqoutpin; -#endif }; int stmfx_function_enable(struct stmfx *stmfx, u32 func); -- cgit From 74c17a0a49a6ad3b32cb130f25196d1f8d5d560e Mon Sep 17 00:00:00 2001 From: Jerome Neanne Date: Fri, 4 Nov 2022 16:23:09 +0100 Subject: mfd: tps65219: Add driver for TI TPS65219 PMIC The TPS65219 is a power management IC PMIC designed to supply a wide range of SoCs in both portable and stationary applications. Any SoC can control TPS65219 over a standard I2C interface. It contains the following components: - Regulators. - Over Temperature warning and Shut down. - GPIOs - Multi Function Pins (MFP) - power-button This patch adds support for tps65219 PMIC. At this time only the functionalities listed below are made available: - Regulators probe and functionalities - warm and cold reset support - SW shutdown support - Regulator warnings via IRQs - Power-button via IRQ Signed-off-by: Jerome Neanne Signed-off-by: Markus Schneider-Pargmann Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20221104152311.1098603-5-jneanne@baylibre.com --- include/linux/mfd/tps65219.h | 345 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 include/linux/mfd/tps65219.h (limited to 'include/linux') diff --git a/include/linux/mfd/tps65219.h b/include/linux/mfd/tps65219.h new file mode 100644 index 000000000000..e6826e34e2a6 --- /dev/null +++ b/include/linux/mfd/tps65219.h @@ -0,0 +1,345 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Functions to access TPS65219 Power Management IC. + * + * Copyright (C) 2022 BayLibre Incorporated - https://www.baylibre.com/ + */ + +#ifndef MFD_TPS65219_H +#define MFD_TPS65219_H + +#include +#include +#include + +struct regmap; +struct regmap_irq_chip_data; + +#define TPS65219_1V35 1350000 +#define TPS65219_1V8 1800000 + +/* TPS chip id list */ +#define TPS65219 0xF0 + +/* I2C ID for TPS65219 part */ +#define TPS65219_I2C_ID 0x24 + +/* All register addresses */ +#define TPS65219_REG_TI_DEV_ID 0x00 +#define TPS65219_REG_NVM_ID 0x01 +#define TPS65219_REG_ENABLE_CTRL 0x02 +#define TPS65219_REG_BUCKS_CONFIG 0x03 +#define TPS65219_REG_LDO4_VOUT 0x04 +#define TPS65219_REG_LDO3_VOUT 0x05 +#define TPS65219_REG_LDO2_VOUT 0x06 +#define TPS65219_REG_LDO1_VOUT 0x07 +#define TPS65219_REG_BUCK3_VOUT 0x8 +#define TPS65219_REG_BUCK2_VOUT 0x9 +#define TPS65219_REG_BUCK1_VOUT 0xA +#define TPS65219_REG_LDO4_SEQUENCE_SLOT 0xB +#define TPS65219_REG_LDO3_SEQUENCE_SLOT 0xC +#define TPS65219_REG_LDO2_SEQUENCE_SLOT 0xD +#define TPS65219_REG_LDO1_SEQUENCE_SLOT 0xE +#define TPS65219_REG_BUCK3_SEQUENCE_SLOT 0xF +#define TPS65219_REG_BUCK2_SEQUENCE_SLOT 0x10 +#define TPS65219_REG_BUCK1_SEQUENCE_SLOT 0x11 +#define TPS65219_REG_nRST_SEQUENCE_SLOT 0x12 +#define TPS65219_REG_GPIO_SEQUENCE_SLOT 0x13 +#define TPS65219_REG_GPO2_SEQUENCE_SLOT 0x14 +#define TPS65219_REG_GPO1_SEQUENCE_SLOT 0x15 +#define TPS65219_REG_POWER_UP_SLOT_DURATION_1 0x16 +#define TPS65219_REG_POWER_UP_SLOT_DURATION_2 0x17 +#define TPS65219_REG_POWER_UP_SLOT_DURATION_3 0x18 +#define TPS65219_REG_POWER_UP_SLOT_DURATION_4 0x19 +#define TPS65219_REG_POWER_DOWN_SLOT_DURATION_1 0x1A +#define TPS65219_REG_POWER_DOWN_SLOT_DURATION_2 0x1B +#define TPS65219_REG_POWER_DOWN_SLOT_DURATION_3 0x1C +#define TPS65219_REG_POWER_DOWN_SLOT_DURATION_4 0x1D +#define TPS65219_REG_GENERAL_CONFIG 0x1E +#define TPS65219_REG_MFP_1_CONFIG 0x1F +#define TPS65219_REG_MFP_2_CONFIG 0x20 +#define TPS65219_REG_STBY_1_CONFIG 0x21 +#define TPS65219_REG_STBY_2_CONFIG 0x22 +#define TPS65219_REG_OC_DEGL_CONFIG 0x23 +/* 'sub irq' MASK registers */ +#define TPS65219_REG_INT_MASK_UV 0x24 +#define TPS65219_REG_MASK_CONFIG 0x25 + +#define TPS65219_REG_I2C_ADDRESS_REG 0x26 +#define TPS65219_REG_USER_GENERAL_NVM_STORAGE 0x27 +#define TPS65219_REG_MANUFACTURING_VER 0x28 +#define TPS65219_REG_MFP_CTRL 0x29 +#define TPS65219_REG_DISCHARGE_CONFIG 0x2A +/* main irq registers */ +#define TPS65219_REG_INT_SOURCE 0x2B +/* 'sub irq' registers */ +#define TPS65219_REG_INT_LDO_3_4 0x2C +#define TPS65219_REG_INT_LDO_1_2 0x2D +#define TPS65219_REG_INT_BUCK_3 0x2E +#define TPS65219_REG_INT_BUCK_1_2 0x2F +#define TPS65219_REG_INT_SYSTEM 0x30 +#define TPS65219_REG_INT_RV 0x31 +#define TPS65219_REG_INT_TIMEOUT_RV_SD 0x32 +#define TPS65219_REG_INT_PB 0x33 + +#define TPS65219_REG_INT_LDO_3_4_POS 0 +#define TPS65219_REG_INT_LDO_1_2_POS 1 +#define TPS65219_REG_INT_BUCK_3_POS 2 +#define TPS65219_REG_INT_BUCK_1_2_POS 3 +#define TPS65219_REG_INT_SYS_POS 4 +#define TPS65219_REG_INT_RV_POS 5 +#define TPS65219_REG_INT_TO_RV_POS 6 +#define TPS65219_REG_INT_PB_POS 7 + +#define TPS65219_REG_USER_NVM_CMD 0x34 +#define TPS65219_REG_POWER_UP_STATUS 0x35 +#define TPS65219_REG_SPARE_2 0x36 +#define TPS65219_REG_SPARE_3 0x37 +#define TPS65219_REG_FACTORY_CONFIG_2 0x41 + +/* Register field definitions */ +#define TPS65219_DEVID_REV_MASK GENMASK(7, 0) +#define TPS65219_BUCKS_LDOS_VOUT_VSET_MASK GENMASK(5, 0) +#define TPS65219_BUCKS_UV_THR_SEL_MASK BIT(6) +#define TPS65219_BUCKS_BW_SEL_MASK BIT(7) +#define LDO_BYP_SHIFT 6 +#define TPS65219_LDOS_BYP_CONFIG_MASK BIT(LDO_BYP_SHIFT) +#define TPS65219_LDOS_LSW_CONFIG_MASK BIT(7) +/* Regulators enable control */ +#define TPS65219_ENABLE_BUCK1_EN_MASK BIT(0) +#define TPS65219_ENABLE_BUCK2_EN_MASK BIT(1) +#define TPS65219_ENABLE_BUCK3_EN_MASK BIT(2) +#define TPS65219_ENABLE_LDO1_EN_MASK BIT(3) +#define TPS65219_ENABLE_LDO2_EN_MASK BIT(4) +#define TPS65219_ENABLE_LDO3_EN_MASK BIT(5) +#define TPS65219_ENABLE_LDO4_EN_MASK BIT(6) +/* power ON-OFF sequence slot */ +#define TPS65219_BUCKS_LDOS_SEQUENCE_OFF_SLOT_MASK GENMASK(3, 0) +#define TPS65219_BUCKS_LDOS_SEQUENCE_ON_SLOT_MASK GENMASK(7, 4) +/* TODO: Not needed, same mapping as TPS65219_ENABLE_REGNAME_EN, factorize */ +#define TPS65219_STBY1_BUCK1_STBY_EN_MASK BIT(0) +#define TPS65219_STBY1_BUCK2_STBY_EN_MASK BIT(1) +#define TPS65219_STBY1_BUCK3_STBY_EN_MASK BIT(2) +#define TPS65219_STBY1_LDO1_STBY_EN_MASK BIT(3) +#define TPS65219_STBY1_LDO2_STBY_EN_MASK BIT(4) +#define TPS65219_STBY1_LDO3_STBY_EN_MASK BIT(5) +#define TPS65219_STBY1_LDO4_STBY_EN_MASK BIT(6) +/* STBY_2 config */ +#define TPS65219_STBY2_GPO1_STBY_EN_MASK BIT(0) +#define TPS65219_STBY2_GPO2_STBY_EN_MASK BIT(1) +#define TPS65219_STBY2_GPIO_STBY_EN_MASK BIT(2) +/* MFP Control */ +#define TPS65219_MFP_I2C_OFF_REQ_MASK BIT(0) +#define TPS65219_MFP_STBY_I2C_CTRL_MASK BIT(1) +#define TPS65219_MFP_COLD_RESET_I2C_CTRL_MASK BIT(2) +#define TPS65219_MFP_WARM_RESET_I2C_CTRL_MASK BIT(3) +#define TPS65219_MFP_GPIO_STATUS_MASK BIT(4) +/* MFP_1 Config */ +#define TPS65219_MFP_1_VSEL_DDR_SEL_MASK BIT(0) +#define TPS65219_MFP_1_VSEL_SD_POL_MASK BIT(1) +#define TPS65219_MFP_1_VSEL_RAIL_MASK BIT(2) +/* MFP_2 Config */ +#define TPS65219_MFP_2_MODE_STBY_MASK GENMASK(1, 0) +#define TPS65219_MFP_2_MODE_RESET_MASK BIT(2) +#define TPS65219_MFP_2_EN_PB_VSENSE_DEGL_MASK BIT(3) +#define TPS65219_MFP_2_EN_PB_VSENSE_MASK GENMASK(5, 4) +#define TPS65219_MFP_2_WARM_COLD_RESET_MASK BIT(6) +#define TPS65219_MFP_2_PU_ON_FSD_MASK BIT(7) +#define TPS65219_MFP_2_EN 0 +#define TPS65219_MFP_2_PB BIT(4) +#define TPS65219_MFP_2_VSENSE BIT(5) +/* MASK_UV Config */ +#define TPS65219_REG_MASK_UV_LDO1_UV_MASK BIT(0) +#define TPS65219_REG_MASK_UV_LDO2_UV_MASK BIT(1) +#define TPS65219_REG_MASK_UV_LDO3_UV_MASK BIT(2) +#define TPS65219_REG_MASK_UV_LDO4_UV_MASK BIT(3) +#define TPS65219_REG_MASK_UV_BUCK1_UV_MASK BIT(4) +#define TPS65219_REG_MASK_UV_BUCK2_UV_MASK BIT(5) +#define TPS65219_REG_MASK_UV_BUCK3_UV_MASK BIT(6) +#define TPS65219_REG_MASK_UV_RETRY_MASK BIT(7) +/* MASK Config */ +// SENSOR_N_WARM_MASK already defined in Thermal +#define TPS65219_REG_MASK_INT_FOR_RV_MASK BIT(4) +#define TPS65219_REG_MASK_EFFECT_MASK GENMASK(2, 1) +#define TPS65219_REG_MASK_INT_FOR_PB_MASK BIT(7) +/* UnderVoltage - Short to GND - OverCurrent*/ +/* LDO3-4 */ +#define TPS65219_INT_LDO3_SCG_MASK BIT(0) +#define TPS65219_INT_LDO3_OC_MASK BIT(1) +#define TPS65219_INT_LDO3_UV_MASK BIT(2) +#define TPS65219_INT_LDO4_SCG_MASK BIT(3) +#define TPS65219_INT_LDO4_OC_MASK BIT(4) +#define TPS65219_INT_LDO4_UV_MASK BIT(5) +/* LDO1-2 */ +#define TPS65219_INT_LDO1_SCG_MASK BIT(0) +#define TPS65219_INT_LDO1_OC_MASK BIT(1) +#define TPS65219_INT_LDO1_UV_MASK BIT(2) +#define TPS65219_INT_LDO2_SCG_MASK BIT(3) +#define TPS65219_INT_LDO2_OC_MASK BIT(4) +#define TPS65219_INT_LDO2_UV_MASK BIT(5) +/* BUCK3 */ +#define TPS65219_INT_BUCK3_SCG_MASK BIT(0) +#define TPS65219_INT_BUCK3_OC_MASK BIT(1) +#define TPS65219_INT_BUCK3_NEG_OC_MASK BIT(2) +#define TPS65219_INT_BUCK3_UV_MASK BIT(3) +/* BUCK1-2 */ +#define TPS65219_INT_BUCK1_SCG_MASK BIT(0) +#define TPS65219_INT_BUCK1_OC_MASK BIT(1) +#define TPS65219_INT_BUCK1_NEG_OC_MASK BIT(2) +#define TPS65219_INT_BUCK1_UV_MASK BIT(3) +#define TPS65219_INT_BUCK2_SCG_MASK BIT(4) +#define TPS65219_INT_BUCK2_OC_MASK BIT(5) +#define TPS65219_INT_BUCK2_NEG_OC_MASK BIT(6) +#define TPS65219_INT_BUCK2_UV_MASK BIT(7) +/* Thermal Sensor */ +#define TPS65219_INT_SENSOR_3_WARM_MASK BIT(0) +#define TPS65219_INT_SENSOR_2_WARM_MASK BIT(1) +#define TPS65219_INT_SENSOR_1_WARM_MASK BIT(2) +#define TPS65219_INT_SENSOR_0_WARM_MASK BIT(3) +#define TPS65219_INT_SENSOR_3_HOT_MASK BIT(4) +#define TPS65219_INT_SENSOR_2_HOT_MASK BIT(5) +#define TPS65219_INT_SENSOR_1_HOT_MASK BIT(6) +#define TPS65219_INT_SENSOR_0_HOT_MASK BIT(7) +/* Residual Voltage */ +#define TPS65219_INT_BUCK1_RV_MASK BIT(0) +#define TPS65219_INT_BUCK2_RV_MASK BIT(1) +#define TPS65219_INT_BUCK3_RV_MASK BIT(2) +#define TPS65219_INT_LDO1_RV_MASK BIT(3) +#define TPS65219_INT_LDO2_RV_MASK BIT(4) +#define TPS65219_INT_LDO3_RV_MASK BIT(5) +#define TPS65219_INT_LDO4_RV_MASK BIT(6) +/* Residual Voltage ShutDown */ +#define TPS65219_INT_BUCK1_RV_SD_MASK BIT(0) +#define TPS65219_INT_BUCK2_RV_SD_MASK BIT(1) +#define TPS65219_INT_BUCK3_RV_SD_MASK BIT(2) +#define TPS65219_INT_LDO1_RV_SD_MASK BIT(3) +#define TPS65219_INT_LDO2_RV_SD_MASK BIT(4) +#define TPS65219_INT_LDO3_RV_SD_MASK BIT(5) +#define TPS65219_INT_LDO4_RV_SD_MASK BIT(6) +#define TPS65219_INT_TIMEOUT_MASK BIT(7) +/* Power Button */ +#define TPS65219_INT_PB_FALLING_EDGE_DETECT_MASK BIT(0) +#define TPS65219_INT_PB_RISING_EDGE_DETECT_MASK BIT(1) +#define TPS65219_INT_PB_REAL_TIME_STATUS_MASK BIT(2) + +#define TPS65219_PB_POS 7 +#define TPS65219_TO_RV_POS 6 +#define TPS65219_RV_POS 5 +#define TPS65219_SYS_POS 4 +#define TPS65219_BUCK_1_2_POS 3 +#define TPS65219_BUCK_3_POS 2 +#define TPS65219_LDO_1_2_POS 1 +#define TPS65219_LDO_3_4_POS 0 + +/* IRQs */ +enum { + /* LDO3-4 register IRQs */ + TPS65219_INT_LDO3_SCG, + TPS65219_INT_LDO3_OC, + TPS65219_INT_LDO3_UV, + TPS65219_INT_LDO4_SCG, + TPS65219_INT_LDO4_OC, + TPS65219_INT_LDO4_UV, + /* LDO1-2 */ + TPS65219_INT_LDO1_SCG, + TPS65219_INT_LDO1_OC, + TPS65219_INT_LDO1_UV, + TPS65219_INT_LDO2_SCG, + TPS65219_INT_LDO2_OC, + TPS65219_INT_LDO2_UV, + /* BUCK3 */ + TPS65219_INT_BUCK3_SCG, + TPS65219_INT_BUCK3_OC, + TPS65219_INT_BUCK3_NEG_OC, + TPS65219_INT_BUCK3_UV, + /* BUCK1-2 */ + TPS65219_INT_BUCK1_SCG, + TPS65219_INT_BUCK1_OC, + TPS65219_INT_BUCK1_NEG_OC, + TPS65219_INT_BUCK1_UV, + TPS65219_INT_BUCK2_SCG, + TPS65219_INT_BUCK2_OC, + TPS65219_INT_BUCK2_NEG_OC, + TPS65219_INT_BUCK2_UV, + /* Thermal Sensor */ + TPS65219_INT_SENSOR_3_WARM, + TPS65219_INT_SENSOR_2_WARM, + TPS65219_INT_SENSOR_1_WARM, + TPS65219_INT_SENSOR_0_WARM, + TPS65219_INT_SENSOR_3_HOT, + TPS65219_INT_SENSOR_2_HOT, + TPS65219_INT_SENSOR_1_HOT, + TPS65219_INT_SENSOR_0_HOT, + /* Residual Voltage */ + TPS65219_INT_BUCK1_RV, + TPS65219_INT_BUCK2_RV, + TPS65219_INT_BUCK3_RV, + TPS65219_INT_LDO1_RV, + TPS65219_INT_LDO2_RV, + TPS65219_INT_LDO3_RV, + TPS65219_INT_LDO4_RV, + /* Residual Voltage ShutDown */ + TPS65219_INT_BUCK1_RV_SD, + TPS65219_INT_BUCK2_RV_SD, + TPS65219_INT_BUCK3_RV_SD, + TPS65219_INT_LDO1_RV_SD, + TPS65219_INT_LDO2_RV_SD, + TPS65219_INT_LDO3_RV_SD, + TPS65219_INT_LDO4_RV_SD, + TPS65219_INT_TIMEOUT, + /* Power Button */ + TPS65219_INT_PB_FALLING_EDGE_DETECT, + TPS65219_INT_PB_RISING_EDGE_DETECT, +}; + +enum tps65219_regulator_id { + /* DCDC's */ + TPS65219_BUCK_1, + TPS65219_BUCK_2, + TPS65219_BUCK_3, + /* LDOs */ + TPS65219_LDO_1, + TPS65219_LDO_2, + TPS65219_LDO_3, + TPS65219_LDO_4, +}; + +/* Number of step-down converters available */ +#define TPS65219_NUM_DCDC 3 +/* Number of LDO voltage regulators available */ +#define TPS65219_NUM_LDO 4 +/* Number of total regulators available */ +#define TPS65219_NUM_REGULATOR (TPS65219_NUM_DCDC + TPS65219_NUM_LDO) + +/* Define the TPS65219 IRQ numbers */ +enum tps65219_irqs { + /* INT source registers */ + TPS65219_TO_RV_SD_SET_IRQ, + TPS65219_RV_SET_IRQ, + TPS65219_SYS_SET_IRQ, + TPS65219_BUCK_1_2_SET_IRQ, + TPS65219_BUCK_3_SET_IRQ, + TPS65219_LDO_1_2_SET_IRQ, + TPS65219_LDO_3_4_SET_IRQ, + TPS65219_PB_SET_IRQ, +}; + +/** + * struct tps65219 - tps65219 sub-driver chip access routines + * + * Device data may be used to access the TPS65219 chip + * + * @dev: MFD device + * @regmap: Regmap for accessing the device registers + * @irq_data: Regmap irq data used for the irq chip + * @nb: notifier block for the restart handler + */ +struct tps65219 { + struct device *dev; + struct regmap *regmap; + + struct regmap_irq_chip_data *irq_data; + struct notifier_block nb; +}; + +#endif /* MFD_TPS65219_H */ -- cgit From e6aeb2721d3bad8379c43644d0380908e93b0187 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:30 +0000 Subject: io_uring: complete all requests in task context This patch adds ctx->task_complete flag. If set, we'll complete all requests in the context of the original task. Note, this extends to completion CQE posting only but not io_kiocb cleanup / free, e.g. io-wq may free the requests in the free calllback. This flag will be used later for optimisations purposes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/21ece72953f76bb2e77659a72a14326227ab6460.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 2 ++ include/linux/io_uring_types.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 29e519752da4..934e5dd4ccc0 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -11,6 +11,8 @@ enum io_uring_cmd_flags { IO_URING_F_UNLOCKED = 2, /* the request is executed from poll, it should not be freed */ IO_URING_F_MULTISHOT = 4, + /* executed by io-wq */ + IO_URING_F_IOWQ = 8, /* int's last bit, sign checks are usually faster than a bit test */ IO_URING_F_NONBLOCK = INT_MIN, diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index accdfecee953..6be1e1359c89 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -208,6 +208,8 @@ struct io_ring_ctx { unsigned int drain_disabled: 1; unsigned int has_evfd: 1; unsigned int syscall_iopoll: 1; + /* all CQEs should be posted only by the submitter task */ + unsigned int task_complete: 1; } ____cacheline_aligned_in_smp; /* submission data */ -- cgit From d34b1b0b6779d4f5ee877b53cad90eef0f1cbe34 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:32 +0000 Subject: io_uring: use tw for putting rsrc Use task_work for completing rsrc removals, it'll be needed later for spinlock optimisations. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cbba5d53a11ee6fc2194dacea262c1d733c8b529.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 6be1e1359c89..dcd8a563ab52 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -328,6 +328,7 @@ struct io_ring_ctx { struct io_rsrc_data *buf_data; struct delayed_work rsrc_put_work; + struct callback_head rsrc_put_tw; struct llist_head rsrc_put_llist; struct list_head rsrc_ref_list; spinlock_t rsrc_ref_lock; -- cgit From 5b481acab4ce017fda8166fa9428511da41109e5 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Dec 2022 15:59:32 +0100 Subject: bpf: do not rely on ALLOW_ERROR_INJECTION for fmod_ret The current way of expressing that a non-bpf kernel component is willing to accept that bpf programs can be attached to it and that they can change the return value is to abuse ALLOW_ERROR_INJECTION. This is debated in the link below, and the result is that it is not a reasonable thing to do. Reuse the kfunc declaration structure to also tag the kernel functions we want to be fmodret. This way we can control from any subsystem which functions are being modified by bpf without touching the verifier. Link: https://lore.kernel.org/all/20221121104403.1545f9b5@gandalf.local.home/ Suggested-by: Alexei Starovoitov Signed-off-by: Benjamin Tissoires Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20221206145936.922196-2-benjamin.tissoires@redhat.com --- include/linux/btf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9aababc5d78..6a0808e7845c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -412,8 +412,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, u32 kfunc_btf_id); +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); -- cgit From c34b7ac65087554627f4840f4ecd6f2107a68fd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Dec 2022 15:40:57 +0100 Subject: block: remove bio_set_op_attrs This macro is obsolete, so replace the last few uses with open coded bi_opf assignments. Signed-off-by: Christoph Hellwig Acked-by: Coly Li > Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20221206144057.720846-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e0b098089ef2..99be590f952f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -472,13 +472,6 @@ static inline enum req_op bio_op(const struct bio *bio) return bio->bi_opf & REQ_OP_MASK; } -/* obsolete, don't use in new code */ -static inline void bio_set_op_attrs(struct bio *bio, enum req_op op, - blk_opf_t op_flags) -{ - bio->bi_opf = op | op_flags; -} - static inline bool op_is_write(blk_opf_t op) { return !!(op & (__force blk_opf_t)1); -- cgit From ef13f8b64728c4b4d28639bbcf30fe1314b18482 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 26 Oct 2022 15:46:58 +0200 Subject: clk: Store clk_core for clk_rate_request The struct clk_rate_request is meant to store the context around a rate request such as the parent, boundaries, and so on. However, it doesn't store the clock the rate request is submitted to, which makes debugging difficult. Let's add a pointer to the relevant clk_core instance in order to improve the debugging of rate requests in a subsequent patch. Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20221018-clk-rate-request-tracing-v2-1-5170b363c413@cerno.tech Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 267cd06b54a0..842e72a5348f 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -44,6 +44,7 @@ struct dentry; * * Should be initialized by calling clk_hw_init_rate_request(). * + * @core: Pointer to the struct clk_core affected by this request * @rate: Requested clock rate. This field will be adjusted by * clock drivers according to hardware capabilities. * @min_rate: Minimum rate imposed by clk users. @@ -55,6 +56,7 @@ struct dentry; * */ struct clk_rate_request { + struct clk_core *core; unsigned long rate; unsigned long min_rate; unsigned long max_rate; -- cgit From df268f6ca7da1088d7ecff4250d1478d3fa2406b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 20:51:12 +0200 Subject: net/mlx5: Introduce IFC bits for migratable Introduce IFC related capabilities to enable setting VF to be able to perform live migration. e.g.: to be migratable. Signed-off-by: Yishai Hadas Reviewed-by: Mark Bloch Acked-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6f..2093131483c7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -68,6 +68,7 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2 = 0x20, MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION = 0x25, }; @@ -1875,7 +1876,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { }; struct mlx5_ifc_cmd_hca_cap_2_bits { - u8 reserved_at_0[0xa0]; + u8 reserved_at_0[0x80]; + + u8 migratable[0x1]; + u8 reserved_at_81[0x1f]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; -- cgit From 47d0c500d76c7b1d220c47cd875763fef04eba2e Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:16 +0200 Subject: net/mlx5: Add generic getters for other functions caps Downstream patch requires to get other function GENERAL2 caps while mlx5_vport_get_other_func_cap() gets only one type of caps (general). Rename it to represent this and introduce a generic implementation of mlx5_vport_get_other_func_cap(). Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Acked-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/vport.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index aad53cb72f17..7f31432f44c2 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -132,4 +132,6 @@ int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev); u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev); +int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out, + u16 opmod); #endif /* __MLX5_VPORT_H__ */ -- cgit From 8f3cbcd6b440032ebc7f7d48a1689dcc70a4eb98 Mon Sep 17 00:00:00 2001 From: ChiYuan Huang Date: Tue, 6 Dec 2022 15:22:21 +0800 Subject: regulator: core: Use different devices for resource allocation and DT lookup Following by the below discussion, there's the potential UAF issue between regulator and mfd. https://lore.kernel.org/all/20221128143601.1698148-1-yangyingliang@huawei.com/ From the analysis of Yingliang CPU A |CPU B mt6370_probe() | devm_mfd_add_devices() | |mt6370_regulator_probe() | regulator_register() | //allocate init_data and add it to devres | regulator_of_get_init_data() i2c_unregister_device() | device_del() | devres_release_all() | // init_data is freed | release_nodes() | | // using init_data causes UAF | regulator_register() It's common to use mfd core to create child device for the regulator. In order to do the DT lookup for init data, the child that registered the regulator would pass its parent as the parameter. And this causes init data resource allocated to its parent, not itself. The issue happen when parent device is going to release and regulator core is still doing some operation of init data constraint for the regulator of child device. To fix it, this patch expand 'regulator_register' API to use the different devices for init data allocation and DT lookup. Reported-by: Yang Yingliang Signed-off-by: ChiYuan Huang Link: https://lore.kernel.org/r/1670311341-32664-1-git-send-email-u0084500@gmail.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index f9a7461e72b8..d3b4a3d4514a 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -687,7 +687,8 @@ static inline int regulator_err2notif(int err) struct regulator_dev * -regulator_register(const struct regulator_desc *regulator_desc, +regulator_register(struct device *dev, + const struct regulator_desc *regulator_desc, const struct regulator_config *config); struct regulator_dev * devm_regulator_register(struct device *dev, -- cgit From 56fb8d90031f71fa8af48fdff8498b9263b9c759 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 5 Dec 2022 20:16:48 +0100 Subject: block: sed-opal: Don't include There is no need to include here. Prefer the less invasive and which are needed in this .h file itself. Signed-off-by: Christophe JAILLET Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/c1d479b39e30fe70c4579a1af035d4db49421f56.1670069909.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 6f837bb6c715..31ac562a17d7 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -11,7 +11,8 @@ #define LINUX_OPAL_H #include -#include +#include +#include struct opal_dev; -- cgit From 04593028d7c1156df9aa69841980529d7f20f9fe Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 30 Sep 2022 22:51:40 -0700 Subject: tpm: st33zp24: drop support for platform data Drop support for platform data from the driver because there are no users of st33zp24_platform_data structure in the mainline kernel. Signed-off-by: Dmitry Torokhov Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/linux/platform_data/st33zp24.h | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 include/linux/platform_data/st33zp24.h (limited to 'include/linux') diff --git a/include/linux/platform_data/st33zp24.h b/include/linux/platform_data/st33zp24.h deleted file mode 100644 index 61db674f36cc..000000000000 --- a/include/linux/platform_data/st33zp24.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * STMicroelectronics TPM Linux driver for TPM 1.2 ST33ZP24 - * Copyright (C) 2009 - 2016 STMicroelectronics - */ -#ifndef __ST33ZP24_H__ -#define __ST33ZP24_H__ - -#define TPM_ST33_I2C "st33zp24-i2c" -#define TPM_ST33_SPI "st33zp24-spi" - -struct st33zp24_platform_data { - int io_lpcpd; -}; - -#endif /* __ST33ZP24_H__ */ -- cgit From fbf8321238bac04368f57af572e05a9c01347a0b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 Dec 2022 16:53:15 -1000 Subject: memcg: Fix possible use-after-free in memcg_write_event_control() memcg_write_event_control() accesses the dentry->d_name of the specified control fd to route the write call. As a cgroup interface file can't be renamed, it's safe to access d_name as long as the specified file is a regular cgroup file. Also, as these cgroup interface files can't be removed before the directory, it's safe to access the parent too. Prior to 347c4a874710 ("memcg: remove cgroup_event->cft"), there was a call to __file_cft() which verified that the specified file is a regular cgroupfs file before further accesses. The cftype pointer returned from __file_cft() was no longer necessary and the commit inadvertently dropped the file type check with it allowing any file to slip through. With the invarients broken, the d_name and parent accesses can now race against renames and removals of arbitrary files and cause use-after-free's. Fix the bug by resurrecting the file type check in __file_cft(). Now that cgroupfs is implemented through kernfs, checking the file operations needs to go through a layer of indirection. Instead, let's check the superblock and dentry type. Signed-off-by: Tejun Heo Fixes: 347c4a874710 ("memcg: remove cgroup_event->cft") Cc: stable@kernel.org # v3.14+ Reported-by: Jann Horn Acked-by: Johannes Weiner Acked-by: Roman Gushchin Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528bd44b59e2..2b7d077de7ef 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -68,6 +68,7 @@ struct css_task_iter { struct list_head iters_node; /* css_set->task_iters */ }; +extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; -- cgit From f1543c7abab25d93bc8e9fae79b4cb3153ed6669 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 31 May 2022 13:55:28 +0300 Subject: net/mlx5: mlx5_ifc updates for MATCH_DEFINER general object Update full structure of match definer and add an ID of the SELECT match definer type. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 68 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2093131483c7..294cfe175c4b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -6108,6 +6108,38 @@ struct mlx5_ifc_match_definer_format_32_bits { u8 inner_dmac_15_0[0x10]; }; +enum { + MLX5_IFC_DEFINER_FORMAT_ID_SELECT = 61, +}; + +#define MLX5_IFC_DEFINER_FORMAT_OFFSET_UNUSED 0x0 +#define MLX5_IFC_DEFINER_FORMAT_OFFSET_OUTER_ETH_PKT_LEN 0x48 +#define MLX5_IFC_DEFINER_DW_SELECTORS_NUM 9 +#define MLX5_IFC_DEFINER_BYTE_SELECTORS_NUM 8 + +struct mlx5_ifc_match_definer_match_mask_bits { + u8 reserved_at_1c0[5][0x20]; + u8 match_dw_8[0x20]; + u8 match_dw_7[0x20]; + u8 match_dw_6[0x20]; + u8 match_dw_5[0x20]; + u8 match_dw_4[0x20]; + u8 match_dw_3[0x20]; + u8 match_dw_2[0x20]; + u8 match_dw_1[0x20]; + u8 match_dw_0[0x20]; + + u8 match_byte_7[0x8]; + u8 match_byte_6[0x8]; + u8 match_byte_5[0x8]; + u8 match_byte_4[0x8]; + + u8 match_byte_3[0x8]; + u8 match_byte_2[0x8]; + u8 match_byte_1[0x8]; + u8 match_byte_0[0x8]; +}; + struct mlx5_ifc_match_definer_bits { u8 modify_field_select[0x40]; @@ -6116,9 +6148,41 @@ struct mlx5_ifc_match_definer_bits { u8 reserved_at_80[0x10]; u8 format_id[0x10]; - u8 reserved_at_a0[0x160]; + u8 reserved_at_a0[0x60]; - u8 match_mask[16][0x20]; + u8 format_select_dw3[0x8]; + u8 format_select_dw2[0x8]; + u8 format_select_dw1[0x8]; + u8 format_select_dw0[0x8]; + + u8 format_select_dw7[0x8]; + u8 format_select_dw6[0x8]; + u8 format_select_dw5[0x8]; + u8 format_select_dw4[0x8]; + + u8 reserved_at_100[0x18]; + u8 format_select_dw8[0x8]; + + u8 reserved_at_120[0x20]; + + u8 format_select_byte3[0x8]; + u8 format_select_byte2[0x8]; + u8 format_select_byte1[0x8]; + u8 format_select_byte0[0x8]; + + u8 format_select_byte7[0x8]; + u8 format_select_byte6[0x8]; + u8 format_select_byte5[0x8]; + u8 format_select_byte4[0x8]; + + u8 reserved_at_180[0x40]; + + union { + struct { + u8 match_mask[16][0x20]; + }; + struct mlx5_ifc_match_definer_match_mask_bits match_mask_format; + }; }; struct mlx5_ifc_general_obj_in_cmd_hdr_bits { -- cgit From 38bf24c38d19dc4ba5ec684ac2afa2f8e256db1e Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 15 Aug 2022 12:45:28 +0300 Subject: net/mlx5: fs, add match on ranges API Range is a new flow destination type which allows matching on a range of values instead of matching on a specific value. Range flow destination has the following fields: - hit_ft: flow table to forward the traffic in case of hit - miss_ft: flow table to forward the traffic in case of miss - field: which packet characteristic to match on - min: minimal value for the selected field - max: maximal value for the selected field Note: - In order to match, the value in the packet should meet the following criteria: min <= value < max - Currently, the only supported field type is L2 packet length Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index c7a91981cd5a..ba6958b49a8e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -50,6 +50,7 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_PORT, MLX5_FLOW_DESTINATION_TYPE_COUNTER, MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM, + MLX5_FLOW_DESTINATION_TYPE_RANGE, }; enum { @@ -143,6 +144,10 @@ enum { MLX5_FLOW_DEST_VPORT_REFORMAT_ID = BIT(1), }; +enum mlx5_flow_dest_range_field { + MLX5_FLOW_DEST_RANGE_FIELD_PKT_LEN = 0, +}; + struct mlx5_flow_destination { enum mlx5_flow_destination_type type; union { @@ -156,6 +161,13 @@ struct mlx5_flow_destination { struct mlx5_pkt_reformat *pkt_reformat; u8 flags; } vport; + struct { + struct mlx5_flow_table *hit_ft; + struct mlx5_flow_table *miss_ft; + enum mlx5_flow_dest_range_field field; + u32 min; + u32 max; + } range; u32 sampler_id; }; }; -- cgit From 6b75bd3d036745b9be30917909f03602099adbdb Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:35 +0530 Subject: bpf: Refactor ARG_PTR_TO_DYNPTR checks into process_dynptr_func ARG_PTR_TO_DYNPTR is akin to ARG_PTR_TO_TIMER, ARG_PTR_TO_KPTR, where the underlying register type is subjected to more special checks to determine the type of object represented by the pointer and its state consistency. Move dynptr checks to their own 'process_dynptr_func' function so that is consistent and in-line with existing code. This also makes it easier to reuse this code for kfunc handling. Then, reuse this consolidated function in kfunc dynptr handling too. Note that for kfuncs, the arg_type constraint of DYNPTR_TYPE_LOCAL has been lifted. Acked-by: David Vernet Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 70d06a99f0b8..df0cb825e0e3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -615,11 +615,9 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, enum bpf_arg_type arg_type); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg); -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type); +struct bpf_call_arg_meta; +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, -- cgit From 270605317366e4535d8d9fc3d9da1ad0fb3c9d45 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:37 +0530 Subject: bpf: Rework process_dynptr_func Recently, user ringbuf support introduced a PTR_TO_DYNPTR register type for use in callback state, because in case of user ringbuf helpers, there is no dynptr on the stack that is passed into the callback. To reflect such a state, a special register type was created. However, some checks have been bypassed incorrectly during the addition of this feature. First, for arg_type with MEM_UNINIT flag which initialize a dynptr, they must be rejected for such register type. Secondly, in the future, there are plans to add dynptr helpers that operate on the dynptr itself and may change its offset and other properties. In all of these cases, PTR_TO_DYNPTR shouldn't be allowed to be passed to such helpers, however the current code simply returns 0. The rejection for helpers that release the dynptr is already handled. For fixing this, we take a step back and rework existing code in a way that will allow fitting in all classes of helpers and have a coherent model for dealing with the variety of use cases in which dynptr is used. First, for ARG_PTR_TO_DYNPTR, it can either be set alone or together with a DYNPTR_TYPE_* constant that denotes the only type it accepts. Next, helpers which initialize a dynptr use MEM_UNINIT to indicate this fact. To make the distinction clear, use MEM_RDONLY flag to indicate that the helper only operates on the memory pointed to by the dynptr, not the dynptr itself. In C parlance, it would be equivalent to taking the dynptr as a point to const argument. When either of these flags are not present, the helper is allowed to mutate both the dynptr itself and also the memory it points to. Currently, the read only status of the memory is not tracked in the dynptr, but it would be trivial to add this support inside dynptr state of the register. With these changes and renaming PTR_TO_DYNPTR to CONST_PTR_TO_DYNPTR to better reflect its usage, it can no longer be passed to helpers that initialize a dynptr, i.e. bpf_dynptr_from_mem, bpf_ringbuf_reserve_dynptr. A note to reviewers is that in code that does mark_stack_slots_dynptr, and unmark_stack_slots_dynptr, we implicitly rely on the fact that PTR_TO_STACK reg is the only case that can reach that code path, as one cannot pass CONST_PTR_TO_DYNPTR to helpers that don't set MEM_RDONLY. In both cases such helpers won't be setting that flag. The next patch will add a couple of selftest cases to make sure this doesn't break. Fixes: 205715673844 ("bpf: Add bpf_user_ringbuf_drain() helper") Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4920ac252754..3de24cfb7a3d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -775,7 +775,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_DYNPTR, /* reg points to a dynptr */ + CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -2828,7 +2828,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); -- cgit From a44e84a9b7764c72896f7241a0ec9ac7e7ef38dd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Nov 2022 20:39:50 +0100 Subject: ext4: fix deadlock due to mbcache entry corruption When manipulating xattr blocks, we can deadlock infinitely looping inside ext4_xattr_block_set() where we constantly keep finding xattr block for reuse in mbcache but we are unable to reuse it because its reference count is too big. This happens because cache entry for the xattr block is marked as reusable (e_reusable set) although its reference count is too big. When this inconsistency happens, this inconsistent state is kept indefinitely and so ext4_xattr_block_set() keeps retrying indefinitely. The inconsistent state is caused by non-atomic update of e_reusable bit. e_reusable is part of a bitfield and e_reusable update can race with update of e_referenced bit in the same bitfield resulting in loss of one of the updates. Fix the problem by using atomic bitops instead. This bug has been around for many years, but it became *much* easier to hit after commit 65f8b80053a1 ("ext4: fix race when reusing xattr blocks"). Cc: stable@vger.kernel.org Fixes: 6048c64b2609 ("mbcache: add reusable flag to cache entries") Fixes: 65f8b80053a1 ("ext4: fix race when reusing xattr blocks") Reported-and-tested-by: Jeremi Piotrowski Reported-by: Thilo Fromm Link: https://lore.kernel.org/r/c77bf00f-4618-7149-56f1-b8d1664b9d07@linux.microsoft.com/ Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20221123193950.16758-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- include/linux/mbcache.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 2da63fd7b98f..97e64184767d 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -10,6 +10,12 @@ struct mb_cache; +/* Cache entry flags */ +enum { + MBE_REFERENCED_B = 0, + MBE_REUSABLE_B +}; + struct mb_cache_entry { /* List of entries in cache - protected by cache->c_list_lock */ struct list_head e_list; @@ -26,8 +32,7 @@ struct mb_cache_entry { atomic_t e_refcnt; /* Key in hash - stable during lifetime of the entry */ u32 e_key; - u32 e_referenced:1; - u32 e_reusable:1; + unsigned long e_flags; /* User provided value - stable during lifetime of the entry */ u64 e_value; }; -- cgit From f30ff35f6266993405c8659e48fddc3180692164 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:12 +0100 Subject: jbd2: switch jbd2_submit_inode_data() to use fs-provided hook for data writeout jbd2_submit_inode_data() hardcoded use of jbd2_journal_submit_inode_data_buffers() for submission of data pages. Make it use j_submit_inode_data_buffers hook instead. This effectively switches ext4 fastcommits to use ext4_writepages() for data writeout instead of generic_writepages(). Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-9-jack@suse.cz Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0b7242370b56..2170e0cc279d 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1662,7 +1662,7 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); -int jbd2_submit_inode_data(struct jbd2_inode *jinode); +int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); int jbd2_fc_release_bufs(journal_t *journal); -- cgit From e47877c7aa821c413b45e05f804819579bdfa1a3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Dec 2022 11:36:32 -1000 Subject: rhashtable: Allow rhashtable to be used from irq-safe contexts rhashtable currently only does bh-safe synchronization making it impossible to use from irq-safe contexts. Switch it to use irq-safe synchronization to remove the restriction. v2: Update the lock functions to return the ulong flags value and unlock functions to take the value directly instead of passing around the pointer. Suggested by Linus. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Linus Torvalds Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 61 +++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 68dab3e08aad..5b5357c0bd8c 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -323,29 +323,36 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert( * When we write to a bucket without unlocking, we use rht_assign_locked(). */ -static inline void rht_lock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) +static inline unsigned long rht_lock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt) { - local_bh_disable(); + unsigned long flags; + + local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); + return flags; } -static inline void rht_lock_nested(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bucket, - unsigned int subclass) +static inline unsigned long rht_lock_nested(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bucket, + unsigned int subclass) { - local_bh_disable(); + unsigned long flags; + + local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); + return flags; } static inline void rht_unlock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) + struct rhash_lock_head __rcu **bkt, + unsigned long flags) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); - local_bh_enable(); + local_irq_restore(flags); } static inline struct rhash_head *__rht_ptr( @@ -393,7 +400,8 @@ static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, - struct rhash_head *obj) + struct rhash_head *obj, + unsigned long flags) { if (rht_is_a_nulls(obj)) obj = NULL; @@ -401,7 +409,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); - local_bh_enable(); + local_irq_restore(flags); } /** @@ -706,6 +714,7 @@ static inline void *__rhashtable_insert_fast( struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; + unsigned long flags; unsigned int hash; int elasticity; void *data; @@ -720,11 +729,11 @@ static inline void *__rhashtable_insert_fast( if (!bkt) goto out; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } @@ -756,9 +765,9 @@ slow_path: RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, flags); data = NULL; goto out; } @@ -785,7 +794,7 @@ slow_path: } atomic_inc(&ht->nelems); - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); @@ -797,7 +806,7 @@ out: return data; out_unlock: - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); goto out; } @@ -991,6 +1000,7 @@ static inline int __rhashtable_remove_fast_one( struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; + unsigned long flags; unsigned int hash; int err = -ENOENT; @@ -999,7 +1009,7 @@ static inline int __rhashtable_remove_fast_one( if (!bkt) return -ENOENT; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; @@ -1043,14 +1053,14 @@ static inline int __rhashtable_remove_fast_one( if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else { - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, flags); } goto unlocked; } - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); @@ -1143,6 +1153,7 @@ static inline int __rhashtable_replace_fast( struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; + unsigned long flags; unsigned int hash; int err = -ENOENT; @@ -1158,7 +1169,7 @@ static inline int __rhashtable_replace_fast( return -ENOENT; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { @@ -1169,15 +1180,15 @@ static inline int __rhashtable_replace_fast( rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else { - rht_assign_unlock(tbl, bkt, obj_new); + rht_assign_unlock(tbl, bkt, obj_new, flags); } err = 0; goto unlocked; } - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); unlocked: return err; -- cgit From 577cc1434e4cc1342c3df6d6a3c85136ab335c81 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 9 Dec 2022 09:29:35 +0100 Subject: lsm: Fix description of fs_context_parse_param The fs_context_parse_param hook already has a description, which seems the right one according to the code. Fixes: 8eb687bc8069 ("lsm: Add/fix return values in lsm_hooks.h and fix formatting") Signed-off-by: Roberto Sassu Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index bad3b6baad86..20e70132584c 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -100,9 +100,6 @@ * the filesystem. * @fc indicates the filesystem context. * @param The parameter. - * Return 0 to indicate that the parameter should be passed on to the - * filesystem, 1 to indicate that the parameter should be discarded or an - * error to indicate that the parameter should be rejected. * * Security hooks for filesystem operations. * -- cgit From 69af4bcaa08d06fd4d788a7f7193fb3c40ac6aba Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 22 Nov 2022 02:10:59 -0500 Subject: regmap-irq: Add handle_mask_sync() callback Provide a public callback handle_mask_sync() that drivers can use when they have more complex IRQ masking logic. The default implementation is regmap_irq_handle_mask_sync(), used if the chip doesn't provide its own callback. Cc: Mark Brown Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/e083474b3d467a86e6cb53da8072de4515bd6276.1669100542.git.william.gray@linaro.org Signed-off-by: Mark Brown --- include/linux/regmap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 5a56fc55ebd7..a3bc695bcca0 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1580,6 +1580,8 @@ struct regmap_irq_chip_data; * before regmap_irq_handler process the interrupts. * @handle_post_irq: Driver specific callback to handle interrupt from device * after handling the interrupts in regmap_irq_handler(). + * @handle_mask_sync: Callback used to handle IRQ mask syncs. The index will be + * in the range [0, num_regs) * @set_type_virt: Driver specific callback to extend regmap_irq_set_type() * and configure virt regs. Deprecated, use @set_type_config * callback and config registers instead. @@ -1641,6 +1643,9 @@ struct regmap_irq_chip { int (*handle_pre_irq)(void *irq_drv_data); int (*handle_post_irq)(void *irq_drv_data); + int (*handle_mask_sync)(struct regmap *map, int index, + unsigned int mask_buf_def, + unsigned int mask_buf, void *irq_drv_data); int (*set_type_virt)(unsigned int **buf, unsigned int type, unsigned long hwirq, int reg); int (*set_type_config)(unsigned int **buf, unsigned int type, -- cgit From 630dc25e43dacfb5af94cd41532e77c47ec1caff Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 5 Dec 2022 16:08:57 +0100 Subject: mm/swap: fix SWP_PFN_BITS with CONFIG_PHYS_ADDR_T_64BIT on 32bit We use "unsigned long" to store a PFN in the kernel and phys_addr_t to store a physical address. On a 64bit system, both are 64bit wide. However, on a 32bit system, the latter might be 64bit wide. This is, for example, the case on x86 with PAE: phys_addr_t and PTEs are 64bit wide, while "unsigned long" only spans 32bit. The current definition of SWP_PFN_BITS without MAX_PHYSMEM_BITS misses that case, and assumes that the maximum PFN is limited by an 32bit phys_addr_t. This implies, that SWP_PFN_BITS will currently only be able to cover 4 GiB - 1 on any 32bit system with 4k page size, which is wrong. Let's rely on the number of bits in phys_addr_t instead, but make sure to not exceed the maximum swap offset, to not make the BUILD_BUG_ON() in is_pfn_swap_entry() unhappy. Note that swp_entry_t is effectively an unsigned long and the maximum swap offset shares that value with the swap type. For example, on an 8 GiB x86 PAE system with a kernel config based on Debian 11.5 (-> CONFIG_FLATMEM=y, CONFIG_X86_PAE=y), we will currently fail removing migration entries (remove_migration_ptes()), because mm/page_vma_mapped.c:check_pte() will fail to identify a PFN match as swp_offset_pfn() wrongly masks off PFN bits. For example, split_huge_page_to_list()->...->remap_page() will leave migration entries in place and continue to unlock the page. Later, when we stumble over these migration entries (e.g., via /proc/self/pagemap), pfn_swap_entry_to_page() will BUG_ON() because these migration entries shouldn't exist anymore and the page was unlocked. [ 33.067591] kernel BUG at include/linux/swapops.h:497! [ 33.067597] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 33.067602] CPU: 3 PID: 742 Comm: cow Tainted: G E 6.1.0-rc8+ #16 [ 33.067605] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014 [ 33.067606] EIP: pagemap_pmd_range+0x644/0x650 [ 33.067612] Code: 00 00 00 00 66 90 89 ce b9 00 f0 ff ff e9 ff fb ff ff 89 d8 31 db e8 48 c6 52 00 e9 23 fb ff ff e8 61 83 56 00 e9 b6 fe ff ff <0f> 0b bf 00 f0 ff ff e9 38 fa ff ff 3e 8d 74 26 00 55 89 e5 57 31 [ 33.067615] EAX: ee394000 EBX: 00000002 ECX: ee394000 EDX: 00000000 [ 33.067617] ESI: c1b0ded4 EDI: 00024a00 EBP: c1b0ddb4 ESP: c1b0dd68 [ 33.067619] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 EFLAGS: 00010246 [ 33.067624] CR0: 80050033 CR2: b7a00000 CR3: 01bbbd20 CR4: 00350ef0 [ 33.067625] Call Trace: [ 33.067628] ? madvise_free_pte_range+0x720/0x720 [ 33.067632] ? smaps_pte_range+0x4b0/0x4b0 [ 33.067634] walk_pgd_range+0x325/0x720 [ 33.067637] ? mt_find+0x1d6/0x3a0 [ 33.067641] ? mt_find+0x1d6/0x3a0 [ 33.067643] __walk_page_range+0x164/0x170 [ 33.067646] walk_page_range+0xf9/0x170 [ 33.067648] ? __kmem_cache_alloc_node+0x2a8/0x340 [ 33.067653] pagemap_read+0x124/0x280 [ 33.067658] ? default_llseek+0x101/0x160 [ 33.067662] ? smaps_account+0x1d0/0x1d0 [ 33.067664] vfs_read+0x90/0x290 [ 33.067667] ? do_madvise.part.0+0x24b/0x390 [ 33.067669] ? debug_smp_processor_id+0x12/0x20 [ 33.067673] ksys_pread64+0x58/0x90 [ 33.067675] __ia32_sys_ia32_pread64+0x1b/0x20 [ 33.067680] __do_fast_syscall_32+0x4c/0xc0 [ 33.067683] do_fast_syscall_32+0x29/0x60 [ 33.067686] do_SYSENTER_32+0x15/0x20 [ 33.067689] entry_SYSENTER_32+0x98/0xf1 Decrease the indentation level of SWP_PFN_BITS and SWP_PFN_MASK to keep it readable and consistent. [david@redhat.com: rely on sizeof(phys_addr_t) and min_t() instead] Link: https://lkml.kernel.org/r/20221206105737.69478-1-david@redhat.com [david@redhat.com: use "int" for comparison, as we're only comparing numbers < 64] Link: https://lkml.kernel.org/r/1f157500-2676-7cef-a84e-9224ed64e540@redhat.com Link: https://lkml.kernel.org/r/20221205150857.167583-1-david@redhat.com Fixes: 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry") Signed-off-by: David Hildenbrand Acked-by: Peter Xu Reviewed-by: Yang Shi Cc: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- include/linux/swapops.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 86b95ccb81bb..b07b277d6a16 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -33,11 +33,13 @@ * can use the extra bits to store other information besides PFN. */ #ifdef MAX_PHYSMEM_BITS -#define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) +#define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) #else /* MAX_PHYSMEM_BITS */ -#define SWP_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) +#define SWP_PFN_BITS min_t(int, \ + sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \ + SWP_TYPE_SHIFT) #endif /* MAX_PHYSMEM_BITS */ -#define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) +#define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) /** * Migration swap entry specific bitfield definitions. Layout: -- cgit From 4a7ba45b1a435e7097ca0f79a847d0949d0eb088 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 Dec 2022 16:53:15 -1000 Subject: memcg: fix possible use-after-free in memcg_write_event_control() memcg_write_event_control() accesses the dentry->d_name of the specified control fd to route the write call. As a cgroup interface file can't be renamed, it's safe to access d_name as long as the specified file is a regular cgroup file. Also, as these cgroup interface files can't be removed before the directory, it's safe to access the parent too. Prior to 347c4a874710 ("memcg: remove cgroup_event->cft"), there was a call to __file_cft() which verified that the specified file is a regular cgroupfs file before further accesses. The cftype pointer returned from __file_cft() was no longer necessary and the commit inadvertently dropped the file type check with it allowing any file to slip through. With the invarients broken, the d_name and parent accesses can now race against renames and removals of arbitrary files and cause use-after-free's. Fix the bug by resurrecting the file type check in __file_cft(). Now that cgroupfs is implemented through kernfs, checking the file operations needs to go through a layer of indirection. Instead, let's check the superblock and dentry type. Link: https://lkml.kernel.org/r/Y5FRm/cfcKPGzWwl@slm.duckdns.org Fixes: 347c4a874710 ("memcg: remove cgroup_event->cft") Signed-off-by: Tejun Heo Reported-by: Jann Horn Acked-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Linus Torvalds Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Cc: [3.14+] Signed-off-by: Andrew Morton --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528bd44b59e2..2b7d077de7ef 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -68,6 +68,7 @@ struct css_task_iter { struct list_head iters_node; /* css_set->task_iters */ }; +extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; -- cgit From ce098da1497c6dee9589fce2c61d1910f4fcf0e7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Dec 2022 22:02:59 -0800 Subject: skbuff: Introduce slab_build_skb() syzkaller reported: BUG: KASAN: slab-out-of-bounds in __build_skb_around+0x235/0x340 net/core/skbuff.c:294 Write of size 32 at addr ffff88802aa172c0 by task syz-executor413/5295 For bpf_prog_test_run_skb(), which uses a kmalloc()ed buffer passed to build_skb(). When build_skb() is passed a frag_size of 0, it means the buffer came from kmalloc. In these cases, ksize() is used to find its actual size, but since the allocation may not have been made to that size, actually perform the krealloc() call so that all the associated buffer size checking will be correctly notified (and use the "new" pointer so that compiler hinting works correctly). Split this logic out into a new interface, slab_build_skb(), but leave the original 0 checking for now to catch any stragglers. Reported-by: syzbot+fda18eaa8c12534ccb3b@syzkaller.appspotmail.com Link: https://groups.google.com/g/syzkaller-bugs/c/UnIKxTtU5-0/m/-wbXinkgAQAJ Fixes: 38931d8989b5 ("mm: Make ksize() a reporting-only function") Cc: Pavel Begunkov Cc: pepsipu Cc: syzbot+fda18eaa8c12534ccb3b@syzkaller.appspotmail.com Cc: Vlastimil Babka Cc: kasan-dev Cc: Andrii Nakryiko Cc: ast@kernel.org Cc: Daniel Borkmann Cc: Hao Luo Cc: Jesper Dangaard Brouer Cc: John Fastabend Cc: jolsa@kernel.org Cc: KP Singh Cc: martin.lau@linux.dev Cc: Stanislav Fomichev Cc: song@kernel.org Cc: Yonghong Song Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221208060256.give.994-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4e464a27adaf..4c8492401a10 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1255,6 +1255,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); +struct sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer -- cgit From 3ed157d0d73f15b6574424dd068ca5f5a8560562 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Fri, 4 Nov 2022 09:43:03 +0800 Subject: sunrpc: svc: Remove an unused static function svc_ungetu32() The svc_ungetu32 function is not used, you could remove it. Signed-off-by: Li zeming Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 0edd837d401d..ed722dd33b46 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -220,13 +220,6 @@ static inline __be32 svc_getu32(struct kvec *iov) return val; } -static inline void svc_ungetu32(struct kvec *iov) -{ - __be32 *vp = (__be32 *)iov->iov_base; - iov->iov_base = (void *)(vp - 1); - iov->iov_len += sizeof(*vp); -} - static inline void svc_putu32(struct kvec *iov, __be32 val) { __be32 *vp = iov->iov_base + iov->iov_len; -- cgit From 44df6f439a1790a5f602e3842879efa88f346672 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:47 -0800 Subject: NFSD: add delegation reaper to react to low memory condition The delegation reaper is called by nfsd memory shrinker's on the 'count' callback. It scans the client list and sends the courtesy CB_RECALL_ANY to the clients that hold delegations. To avoid flooding the clients with CB_RECALL_ANY requests, the delegation reaper sends only one CB_RECALL_ANY request to each client per 5 seconds. Signed-off-by: Dai Ngo [ cel: moved definition of RCA4_TYPE_MASK_RDATA_DLG ] Signed-off-by: Chuck Lever --- include/linux/nfs4.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 8d04b6a5964c..730003c4f4af 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -732,4 +732,17 @@ enum nfs4_setxattr_options { SETXATTR4_CREATE = 1, SETXATTR4_REPLACE = 2, }; + +enum { + RCA4_TYPE_MASK_RDATA_DLG = 0, + RCA4_TYPE_MASK_WDATA_DLG = 1, + RCA4_TYPE_MASK_DIR_DLG = 2, + RCA4_TYPE_MASK_FILE_LAYOUT = 3, + RCA4_TYPE_MASK_BLK_LAYOUT = 4, + RCA4_TYPE_MASK_OBJ_LAYOUT_MIN = 8, + RCA4_TYPE_MASK_OBJ_LAYOUT_MAX = 9, + RCA4_TYPE_MASK_OTHER_LAYOUT_MIN = 12, + RCA4_TYPE_MASK_OTHER_LAYOUT_MAX = 15, +}; + #endif -- cgit From bfd5a5e82d22da43afa0e2bb9fb72339aa79c6cc Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Dec 2022 10:21:52 +0000 Subject: tracing: Fix some checker warnings Fix some checker warnings in the trace code by adding __printf attributes to a number of trace functions and their declarations. Changes: ======== ver #2) - Dropped the fix for the unconditional tracing_max_lat_fops decl[1]. Link: https://lore.kernel.org/r/20221205180617.9b9d3971cbe06ee536603523@kernel.org/ [1] Link: https://lore.kernel.org/r/166992525941.1716618.13740663757583361463.stgit@warthog.procyon.org.uk/ # v1 Link: https://lkml.kernel.org/r/167023571258.382307.15314866482834835192.stgit@warthog.procyon.org.uk Signed-off-by: David Howells Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 3 ++- include/linux/trace_seq.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index bb2053246d6a..4342e996bcdb 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -234,7 +234,8 @@ void tracing_record_taskinfo_sched_switch(struct task_struct *prev, void tracing_record_cmdline(struct task_struct *task); void tracing_record_tgid(struct task_struct *task); -int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); +int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) + __printf(3, 4); struct event_filter; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 5a2c650d9e1c..0c4c7587d6c3 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -97,7 +97,8 @@ extern int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, const void *buf, size_t len, bool ascii); #else /* CONFIG_TRACING */ -static inline void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +static inline __printf(2, 3) +void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { } static inline void -- cgit From 5dd9cdbc9dec3e99b19e483767e247d15ca8cc0d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 9 Dec 2022 15:57:29 +0200 Subject: bpf: states_equal() must build idmap for all function frames verifier.c:states_equal() must maintain register ID mapping across all function frames. Otherwise the following example might be erroneously marked as safe: main: fp[-24] = map_lookup_elem(...) ; frame[0].fp[-24].id == 1 fp[-32] = map_lookup_elem(...) ; frame[0].fp[-32].id == 2 r1 = &fp[-24] r2 = &fp[-32] call foo() r0 = 0 exit foo: 0: r9 = r1 1: r8 = r2 2: r7 = ktime_get_ns() 3: r6 = ktime_get_ns() 4: if (r6 > r7) goto skip_assign 5: r9 = r8 skip_assign: ; <--- checkpoint 6: r9 = *r9 ; (a) frame[1].r9.id == 2 ; (b) frame[1].r9.id == 1 7: if r9 == 0 goto exit: ; mark_ptr_or_null_regs() transfers != 0 info ; for all regs sharing ID: ; (a) r9 != 0 => &frame[0].fp[-32] != 0 ; (b) r9 != 0 => &frame[0].fp[-24] != 0 8: r8 = *r8 ; (a) r8 == &frame[0].fp[-32] ; (b) r8 == &frame[0].fp[-32] 9: r0 = *r8 ; (a) safe ; (b) unsafe exit: 10: exit While processing call to foo() verifier considers the following execution paths: (a) 0-10 (b) 0-4,6-10 (There is also path 0-7,10 but it is not interesting for the issue at hand. (a) is verified first.) Suppose that checkpoint is created at (6) when path (a) is verified, next path (b) is verified and (6) is reached. If states_equal() maintains separate 'idmap' for each frame the mapping at (6) for frame[1] would be empty and regsafe(r9)::check_ids() would add a pair 2->1 and return true, which is an error. If states_equal() maintains single 'idmap' for all frames the mapping at (6) would be { 1->1, 2->2 } and regsafe(r9)::check_ids() would return false when trying to add a pair 2->1. This issue was suggested in the following discussion: https://lore.kernel.org/bpf/CAEf4BzbFB5g4oUfyxk9rHy-PJSLQ3h8q9mV=rVoXfr_JVm8+1Q@mail.gmail.com/ Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index df0cb825e0e3..53d175cbaa02 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -273,9 +273,9 @@ struct bpf_id_pair { u32 cur; }; -/* Maximum number of register states that can exist at once */ -#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) #define MAX_CALL_FRAMES 8 +/* Maximum number of register states that can exist at once */ +#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; -- cgit From 01744ce9f07f0b76b0b2d30adba2a7c104f1ff2a Mon Sep 17 00:00:00 2001 From: Naveen Krishna Chatradhi Date: Mon, 5 Dec 2022 10:54:13 +0000 Subject: i3c: Correct the macro module_i3c_i2c_driver Present definition for module_i3c_i2c_driver uses only the 1st argument i.e., struct i3c_driver. Irrespective of CONFIG_I3C being enabled/disabled, struct i2c_driver is never passed to module_driver() Passing struct i2c_driver as the 4th argument works. Signed-off-by: Akshay Gupta Signed-off-by: Naveen Krishna Chatradhi Link: https://lore.kernel.org/r/20221205105413.937704-1-naveenkrishna.chatradhi@amd.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/device.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index 8242e13e7b0b..419192b5cc4d 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -287,7 +287,8 @@ static inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv, #define module_i3c_i2c_driver(__i3cdrv, __i2cdrv) \ module_driver(__i3cdrv, \ i3c_i2c_driver_register, \ - i3c_i2c_driver_unregister) + i3c_i2c_driver_unregister, \ + __i2cdrv) int i3c_device_do_priv_xfers(struct i3c_device *dev, struct i3c_priv_xfer *xfers, -- cgit From 672825cd2823a0cee4687ce80fef5b702ff3caa3 Mon Sep 17 00:00:00 2001 From: Jack Chen Date: Wed, 7 Dec 2022 15:50:59 -0500 Subject: i3c: export SETDASA method Because not all I3C drivers have the hot-join feature ready, and especially not all I3C devices support hot-join feature, exporting SETDASA method could be useful. With this function, the I3C controller could perform a DAA to I3C devices when users decide to turn these I3C devices off and on again during run-time. Tested: This change has been tested with turnning off an I3C device and turning on it again during run-time. The device driver calls SETDASA method to perform DAA to the device. And communication between I3C controller and device is set up again correctly. Signed-off-by: Jack Chen Link: https://lore.kernel.org/r/20221207205059.3848851-1-zenghuchen@google.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index 419192b5cc4d..1c997abe868c 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -294,6 +294,8 @@ int i3c_device_do_priv_xfers(struct i3c_device *dev, struct i3c_priv_xfer *xfers, int nxfers); +int i3c_device_do_setdasa(struct i3c_device *dev); + void i3c_device_get_info(struct i3c_device *dev, struct i3c_device_info *info); struct i3c_ibi_payload { -- cgit From c31783eeae7b22dc3f6edde7339de6112959225d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:38 +0200 Subject: mm/pagewalk: don't trigger test_walk() in walk_page_vma() As Peter points out, the caller passes a single VMA and can just do that check itself. And in fact, no existing users rely on test_walk() getting called. So let's just remove it and make the implementation slightly more efficient. Link: https://lkml.kernel.org/r/20221021101141.84170-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index f3fafb731ffd..37dc0208862d 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -27,6 +27,8 @@ struct mm_walk; * "do page table walk over the current vma", returning * a negative value means "abort current page table walk * right now" and returning 1 means "skip the current vma" + * Note that this callback is not called when the caller + * passes in a single VMA as for walk_page_vma(). * @pre_vma: if set, called before starting walk on a non-null vma. * @post_vma: if set, called after a walk on a non-null vma, provided * that @pre_vma and the vma walk succeeded. -- cgit From cb8d863313436339fb60f7dd5131af2e5854621e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:35 +0200 Subject: mm: remove VM_FAULT_WRITE All users -- GUP and KSM -- are gone, let's just remove it. Link: https://lkml.kernel.org/r/20221021101141.84170-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Peter Xu Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 018b1c098173..199f98be6f9c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -929,7 +929,6 @@ typedef __bitwise unsigned int vm_fault_t; * @VM_FAULT_OOM: Out Of Memory * @VM_FAULT_SIGBUS: Bad access * @VM_FAULT_MAJOR: Page read from storage - * @VM_FAULT_WRITE: Special case for get_user_pages * @VM_FAULT_HWPOISON: Hit poisoned small page * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded * in upper bits @@ -950,7 +949,6 @@ enum vm_fault_reason { VM_FAULT_OOM = (__force vm_fault_t)0x000001, VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, - VM_FAULT_WRITE = (__force vm_fault_t)0x000008, VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, @@ -976,7 +974,6 @@ enum vm_fault_reason { { VM_FAULT_OOM, "OOM" }, \ { VM_FAULT_SIGBUS, "SIGBUS" }, \ { VM_FAULT_MAJOR, "MAJOR" }, \ - { VM_FAULT_WRITE, "WRITE" }, \ { VM_FAULT_HWPOISON, "HWPOISON" }, \ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ -- cgit From e07cda5f232fac4de0925d8a4c92e51e41fa2f6e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:39 +0200 Subject: mm/pagewalk: add walk_page_range_vma() Let's add walk_page_range_vma(), which is similar to walk_page_vma(), however, is only interested in a subset of the VMA range. To be used in KSM code to stop using follow_page() next. Link: https://lkml.kernel.org/r/20221021101141.84170-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 37dc0208862d..959f52e5867d 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -101,6 +101,9 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private); int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, -- cgit From f7355e99d9f71fcde093193fd4b569a648ba5ce3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:41 +0200 Subject: mm/gup: remove FOLL_MIGRATION Fortunately, the last user (KSM) is gone, so let's just remove this rather special code from generic GUP handling -- especially because KSM never required the PMD handling as KSM only deals with individual base pages. [akpm@linux-foundation.org: fix merge snafu]Link: https://lkml.kernel.org/r/20221021101141.84170-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8df5cae69c80..767c8c522e70 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,7 +3057,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * and return without waiting upon it */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_ANON 0x8000 /* don't do file mappings */ -- cgit From d3a89233583bf8edab18ac09732759c71dbe0173 Mon Sep 17 00:00:00 2001 From: zhang songyi Date: Mon, 28 Nov 2022 21:07:43 +0800 Subject: include/linux/pgtable.h: : remove redundant pte variable Return value from ptep_get_and_clear_full() directly instead of taking this in another redundant variable. Link: https://lkml.kernel.org/r/202211282107437343474@zte.com.cn Signed-off-by: zhang songyi Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c74cce67eec8..dfabd549d2e7 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -425,9 +425,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { - pte_t pte; - pte = ptep_get_and_clear(mm, address, ptep); - return pte; + return ptep_get_and_clear(mm, address, ptep); } #endif -- cgit From 7b09f5af01ede480cbe7abcb281cf17550a46ff5 Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Thu, 27 Oct 2022 20:52:51 +0800 Subject: LoongArch: add sparse memory vmemmap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add sparse memory vmemmap support for LoongArch. SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. Link: https://lkml.kernel.org/r/20221027125253.3458989-3-chenhuacai@loongson.cn Signed-off-by: Min Zhou Signed-off-by: Feiyang Chen Signed-off-by: Huacai Chen Reviewed-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: Dinh Nguyen Cc: Guo Ren Cc: Jiaxun Yang Cc: Peter Zijlstra Cc: Philippe Mathieu-Daudé Cc: Thomas Bogendoerfer Cc: Will Deacon Cc: Xuefeng Li Cc: Xuerui Wang Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 767c8c522e70..7c31f898337c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3360,6 +3360,8 @@ void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); +void pmd_init(void *addr); +void pud_init(void *addr); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); -- cgit From 2045a3b8911b6ee64dd9b522d61abc468ecdcdb5 Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Thu, 27 Oct 2022 20:52:52 +0800 Subject: mm/sparse-vmemmap: generalise vmemmap_populate_hugepages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generalise vmemmap_populate_hugepages() so ARM64 & X86 & LoongArch can share its implementation. Link: https://lkml.kernel.org/r/20221027125253.3458989-4-chenhuacai@loongson.cn Signed-off-by: Feiyang Chen Signed-off-by: Huacai Chen Acked-by: Will Deacon Acked-by: Dave Hansen Reviewed-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dinh Nguyen Cc: Guo Ren Cc: Jiaxun Yang Cc: Min Zhou Cc: Peter Zijlstra Cc: Philippe Mathieu-Daudé Cc: Thomas Bogendoerfer Cc: Xuefeng Li Cc: Xuerui Wang Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c31f898337c..472cb60ace07 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3373,8 +3373,14 @@ struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); +void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next); +int vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +int vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); void vmemmap_populate_print_last(void); -- cgit From 3720dd6dcac38d03424d6ba38107f39af5318bcf Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:22 -0700 Subject: filemap: convert replace_page_cache_page() to replace_page_cache_folio() Patch series "Removing the lru_cache_add() wrapper". This patchset replaces all calls of lru_cache_add() with the folio equivalent: folio_add_lru(). This is allows us to get rid of the wrapper The series passes xfstests and the userfaultfd selftests. This patch (of 5): Eliminates 7 calls to compound_head(). Link: https://lkml.kernel.org/r/20221101175326.13265-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221101175326.13265-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2ec0ca1f3d38..29e1f9e76eb6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1102,7 +1102,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); void __filemap_remove_folio(struct folio *folio, void *shadow); -void replace_page_cache_page(struct page *old, struct page *new); +void replace_page_cache_folio(struct folio *old, struct folio *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch); bool filemap_release_folio(struct folio *folio, gfp_t gfp); -- cgit From 6e1ca48d0669b0f5efcbaa051b23cd8e651a1614 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:26 -0700 Subject: folio-compat: remove lru_cache_add() There are no longer any callers of lru_cache_add(), so remove it. This saves 79 bytes of kernel text. Also cleanup some comments such that they reference the new folio_add_lru() instead. Link: https://lkml.kernel.org/r/20221101175326.13265-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index b61e2007d156..0ceed49516ad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -381,7 +381,6 @@ void lru_note_cost(struct lruvec *lruvec, bool file, void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); -void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); -- cgit From 9fd330582b2fe43c49ebcd02b2480f051f85aad4 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:30 -0800 Subject: mm: add folio dtor and order setter functions Patch series "convert core hugetlb functions to folios", v5. ============== OVERVIEW =========================== Now that many hugetlb helper functions that deal with hugetlb specific flags[1] and hugetlb cgroups[2] are converted to folios, higher level allocation, prep, and freeing functions within hugetlb can also be converted to operate in folios. Patch 1 of this series implements the wrapper functions around setting the compound destructor and compound order for a folio. Besides the user added in patch 1, patch 2 and patch 9 also use these helper functions. Patches 2-10 convert the higher level hugetlb functions to folios. ============== TESTING =========================== LTP: Ran 10 back to back rounds of the LTP hugetlb test suite. Gigantic Huge Pages: Test allocation and freeing via hugeadm commands: hugeadm --pool-pages-min 1GB:10 hugeadm --pool-pages-min 1GB:0 Demote: Demote 1 1GB hugepages to 512 2MB hugepages echo 1 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages echo 1 > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # 512 cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages # 0 [1] https://lore.kernel.org/lkml/20220922154207.1575343-1-sidhartha.kumar@oracle.com/ [2] https://lore.kernel.org/linux-mm/20221101223059.460937-1-sidhartha.kumar@oracle.com/ This patch (of 10): Add folio equivalents for set_compound_order() and set_compound_page_dtor(). Also remove extra new-lines introduced by mm/hugetlb: convert move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert hugetlb_cgroup_uncharge_page() to folios. [sidhartha.kumar@oracle.com: clarify folio_set_compound_order() zero support] Link: https://lkml.kernel.org/r/20221207223731.32784-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20221129225039.82257-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20221129225039.82257-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Suggested-by: Mike Kravetz Suggested-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Tarun Sahu Cc: Rasmus Villemoes Cc: Wei Chen Signed-off-by: Andrew Morton --- include/linux/mm.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 472cb60ace07..7dc376052d40 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -997,6 +997,13 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } +static inline void folio_set_compound_dtor(struct folio *folio, + enum compound_dtor_id compound_dtor) +{ + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio); + folio->_folio_dtor = compound_dtor; +} + void destroy_large_folio(struct folio *folio); static inline int head_compound_pincount(struct page *head) @@ -1012,6 +1019,22 @@ static inline void set_compound_order(struct page *page, unsigned int order) #endif } +/* + * folio_set_compound_order is generally passed a non-zero order to + * initialize a large folio. However, hugetlb code abuses this by + * passing in zero when 'dissolving' a large folio. + */ +static inline void folio_set_compound_order(struct folio *folio, + unsigned int order) +{ + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + + folio->_folio_order = order; +#ifdef CONFIG_64BIT + folio->_folio_nr_pages = order ? 1U << order : 0; +#endif +} + /* Returns the number of pages in this potentially compound page. */ static inline unsigned long compound_nr(struct page *page) { -- cgit From 169004265860327182ecf92297b25b6271e81e96 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:51 +0000 Subject: fsdax: introduce page->share for fsdax in reflink mode Patch series "fsdax,xfs: fix warning messages", v2. Many testcases failed in dax+reflink mode with warning message in dmesg. Such as generic/051,075,127. The warning message is like this: [ 775.509337] ------------[ cut here ]------------ [ 775.509636] WARNING: CPU: 1 PID: 16815 at fs/dax.c:386 dax_insert_entry.cold+0x2e/0x69 [ 775.510151] Modules linked in: auth_rpcgss oid_registry nfsv4 algif_hash af_alg af_packet nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink ip6table_filter ip6_tables iptable_filter ip_tables x_tables dax_pmem nd_pmem nd_btt sch_fq_codel configfs xfs libcrc32c fuse [ 775.524288] CPU: 1 PID: 16815 Comm: fsx Kdump: loaded Tainted: G W 6.1.0-rc4+ #164 eb34e4ee4200c7cbbb47de2b1892c5a3e027fd6d [ 775.524904] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.0-3-3 04/01/2014 [ 775.525460] RIP: 0010:dax_insert_entry.cold+0x2e/0x69 [ 775.525797] Code: c7 c7 18 eb e0 81 48 89 4c 24 20 48 89 54 24 10 e8 73 6d ff ff 48 83 7d 18 00 48 8b 54 24 10 48 8b 4c 24 20 0f 84 e3 e9 b9 ff <0f> 0b e9 dc e9 b9 ff 48 c7 c6 a0 20 c3 81 48 c7 c7 f0 ea e0 81 48 [ 775.526708] RSP: 0000:ffffc90001d57b30 EFLAGS: 00010082 [ 775.527042] RAX: 000000000000002a RBX: 0000000000000000 RCX: 0000000000000042 [ 775.527396] RDX: ffffea000a0f6c80 RSI: ffffffff81dfab1b RDI: 00000000ffffffff [ 775.527819] RBP: ffffea000a0f6c40 R08: 0000000000000000 R09: ffffffff820625e0 [ 775.528241] R10: ffffc90001d579d8 R11: ffffffff820d2628 R12: ffff88815fc98320 [ 775.528598] R13: ffffc90001d57c18 R14: 0000000000000000 R15: 0000000000000001 [ 775.528997] FS: 00007f39fc75d740(0000) GS:ffff88817bc80000(0000) knlGS:0000000000000000 [ 775.529474] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 775.529800] CR2: 00007f39fc772040 CR3: 0000000107eb6001 CR4: 00000000003706e0 [ 775.530214] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 775.530592] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 775.531002] Call Trace: [ 775.531230] [ 775.531444] dax_fault_iter+0x267/0x6c0 [ 775.531719] dax_iomap_pte_fault+0x198/0x3d0 [ 775.532002] __xfs_filemap_fault+0x24a/0x2d0 [xfs aa8d25411432b306d9554da38096f4ebb86bdfe7] [ 775.532603] __do_fault+0x30/0x1e0 [ 775.532903] do_fault+0x314/0x6c0 [ 775.533166] __handle_mm_fault+0x646/0x1250 [ 775.533480] handle_mm_fault+0xc1/0x230 [ 775.533810] do_user_addr_fault+0x1ac/0x610 [ 775.534110] exc_page_fault+0x63/0x140 [ 775.534389] asm_exc_page_fault+0x22/0x30 [ 775.534678] RIP: 0033:0x7f39fc55820a [ 775.534950] Code: 00 01 00 00 00 74 99 83 f9 c0 0f 87 7b fe ff ff c5 fe 6f 4e 20 48 29 fe 48 83 c7 3f 49 8d 0c 10 48 83 e7 c0 48 01 fe 48 29 f9 a4 c4 c1 7e 7f 00 c4 c1 7e 7f 48 20 c5 f8 77 c3 0f 1f 44 00 00 [ 775.535839] RSP: 002b:00007ffc66a08118 EFLAGS: 00010202 [ 775.536157] RAX: 00007f39fc772001 RBX: 0000000000042001 RCX: 00000000000063c1 [ 775.536537] RDX: 0000000000006400 RSI: 00007f39fac42050 RDI: 00007f39fc772040 [ 775.536919] RBP: 0000000000006400 R08: 00007f39fc772001 R09: 0000000000042000 [ 775.537304] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001 [ 775.537694] R13: 00007f39fc772000 R14: 0000000000006401 R15: 0000000000000003 [ 775.538086] [ 775.538333] ---[ end trace 0000000000000000 ]--- This also affects dax+noreflink mode if we run the test after a dax+reflink test. So, the most urgent thing is solving the warning messages. With these fixes, most warning messages in dax_associate_entry() are gone. But honestly, generic/388 will randomly failed with the warning. The case shutdown the xfs when fsstress is running, and do it for many times. I think the reason is that dax pages in use are not able to be invalidated in time when fs is shutdown. The next time dax page to be associated, it still remains the mapping value set last time. I'll keep on solving it. The warning message in dax_writeback_one() can also be fixed because of the dax unshare. This patch (of 8): fsdax page is used not only when CoW, but also mapread. To make the it easily understood, use 'share' to indicate that the dax page is shared by more than one extent. And add helper functions to use it. Also, the flag needs to be renamed to PAGE_MAPPING_DAX_SHARED. [ruansy.fnst@fujitsu.com: rename several functions] Link: https://lkml.kernel.org/r/1669972991-246-1-git-send-email-ruansy.fnst@fujitsu.com [ruansy.fnst@fujitsu.com: v2.2] Link: https://lkml.kernel.org/r/1670381359-53-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-2-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: Alistair Popple Cc: John Hubbard Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 ++++- include/linux/page-flags.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 199f98be6f9c..3b8475007734 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -104,7 +104,10 @@ struct page { }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; - pgoff_t index; /* Our offset within mapping. */ + union { + pgoff_t index; /* Our offset within mapping. */ + unsigned long share; /* share count for fsdax */ + }; /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e42c55a7e012..9aec9fd8c50b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -638,7 +638,7 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) * Different with flags above, this flag is used only for fsdax mode. It * indicates that this page->mapping is now under reflink case. */ -#define PAGE_MAPPING_DAX_COW 0x1 +#define PAGE_MAPPING_DAX_SHARED ((void *)0x1) static __always_inline bool folio_mapping_flags(struct folio *folio) { -- cgit From d984648e428bf88cbd94ebe346c73632cb92fffb Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:33 +0000 Subject: fsdax,xfs: port unshare to fsdax Implement unshare in fsdax mode: copy data from srcmap to iomap. Link: https://lkml.kernel.org/r/1669908753-169-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- include/linux/dax.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index ba985333e26b..2b5ecb591059 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -205,6 +205,8 @@ static inline void dax_unlock_mapping_entry(struct address_space *mapping, } #endif +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops); int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, -- cgit From adb8213014b25c7f1d75d5b219becaadcd695efb Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 2 Dec 2022 03:15:10 +0000 Subject: mm: memcg: fix stale protection of reclaim target memcg Patch series "mm: memcg: fix protection of reclaim target memcg", v3. This series fixes a bug in calculating the protection of the reclaim target memcg where we end up using stale effective protection values from the last reclaim operation, instead of completely ignoring the protection of the reclaim target as intended. More detailed explanation and examples in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest case that catches the bug. This patch (of 3): When we are doing memcg reclaim, the intended behavior is that we ignore any protection (memory.min, memory.low) of the target memcg (but not its children). Ever since the patch pointed to by the "Fixes" tag, we actually read a stale value for the target memcg protection when deciding whether to skip the memcg or not because it is protected. If the stale value happens to be high enough, we don't reclaim from the target memcg. Essentially, in some cases we may falsely skip reclaiming from the target memcg of reclaim because we read a stale protection value from last time we reclaimed from it. During reclaim, mem_cgroup_calculate_protection() is used to determine the effective protection (emin and elow) values of a memcg. The protection of the reclaim target is ignored, but we cannot set their effective protection to 0 due to a limitation of the current implementation (see comment in mem_cgroup_protection()). Instead, we leave their effective protection values unchaged, and later ignore it in mem_cgroup_protection(). However, mem_cgroup_protection() is called later in shrink_lruvec()->get_scan_count(), which is after the mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result, the stale effective protection values of the target memcg may lead us to skip reclaiming from the target memcg entirely, before calling shrink_lruvec(). This can be even worse with recursive protection, where the stale target memcg protection can be higher than its standalone protection. See two examples below (a similar version of example (a) is added to test_memcontrol in a later patch). (a) A simple example with proactive reclaim is as follows. Consider the following hierarchy: ROOT | A | B (memory.min = 10M) Consider the following scenario: - B has memory.current = 10M. - The system undergoes global reclaim (or memcg reclaim in A). - In shrink_node_memcgs(): - mem_cgroup_calculate_protection() calculates the effective min (emin) of B as 10M. - mem_cgroup_below_min() returns true for B, we do not reclaim from B. - Now if we want to reclaim 5M from B using proactive reclaim (memory.reclaim), we should be able to, as the protection of the target memcg should be ignored. - In shrink_node_memcgs(): - mem_cgroup_calculate_protection() immediately returns for B without doing anything, as B is the target memcg, relying on mem_cgroup_protection() to ignore B's stale effective min (still 10M). - mem_cgroup_below_min() reads the stale effective min for B and we skip it instead of ignoring its protection as intended, as we never reach mem_cgroup_protection(). (b) An more complex example with recursive protection is as follows. Consider the following hierarchy with memory_recursiveprot: ROOT | A (memory.min = 50M) | B (memory.min = 10M, memory.high = 40M) Consider the following scenario: - B has memory.current = 35M. - The system undergoes global reclaim (target memcg is NULL). - B will have an effective min of 50M (all of A's unclaimed protection). - B will not be reclaimed from. - Now allocate 10M more memory in B, pushing it above it's high limit. - The system undergoes memcg reclaim from B (target memcg is B). - Like example (a), we do nothing in mem_cgroup_calculate_protection(), then call mem_cgroup_below_min(), which will read the stale effective min for B (50M) and skip it. In this case, it's even worse because we are not just considering B's standalone protection (10M), but we are reading a much higher stale protection (50M) which will cause us to not reclaim from B at all. This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks") which made mem_cgroup_calculate_protection() only change the state without returning any value. Before that commit, we used to return MEMCG_PROT_NONE for the target memcg, which would cause us to skip the mem_cgroup_below_{min/low}() checks. After that commit we do not return anything and we end up checking the min & low effective protections for the target memcg, which are stale. Update mem_cgroup_supports_protection() to also check if we are reclaiming from the target, and rename it to mem_cgroup_unprotected() (now returns true if we should not protect the memcg, much simpler logic). Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks") Signed-off-by: Yosry Ahmed Reviewed-by: Roman Gushchin Cc: Chris Down Cc: David Rientjes Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e1644a24009c..d3c8203cab6c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -615,28 +615,32 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root, void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); -static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg) +static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, + struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support - * protection. + * protection. The target memcg's protection is ignored, see + * mem_cgroup_calculate_protection() and mem_cgroup_protection() */ - return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg); - + return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) || + memcg == target; } -static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_low(struct mem_cgroup *target, + struct mem_cgroup *memcg) { - if (!mem_cgroup_supports_protection(memcg)) + if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } -static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_min(struct mem_cgroup *target, + struct mem_cgroup *memcg) { - if (!mem_cgroup_supports_protection(memcg)) + if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.emin) >= @@ -1209,12 +1213,19 @@ static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, { } -static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) +static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, + struct mem_cgroup *memcg) +{ + return true; +} +static inline bool mem_cgroup_below_low(struct mem_cgroup *target, + struct mem_cgroup *memcg) { return false; } -static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_min(struct mem_cgroup *target, + struct mem_cgroup *memcg) { return false; } -- cgit From 12a5d3955227b0d7e04fb793ccceeb2a1dd275c5 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 2 Dec 2022 14:35:31 -0800 Subject: mm: add nodes= arg to memory.reclaim The nodes= arg instructs the kernel to only scan the given nodes for proactive reclaim. For example use cases, consider a 2 tier memory system: nodes 0,1 -> top tier nodes 2,3 -> second tier $ echo "1m nodes=0" > memory.reclaim This instructs the kernel to attempt to reclaim 1m memory from node 0. Since node 0 is a top tier node, demotion will be attempted first. This is useful to direct proactive reclaim to specific nodes that are under pressure. $ echo "1m nodes=2,3" > memory.reclaim This instructs the kernel to attempt to reclaim 1m memory in the second tier, since this tier of memory has no demotion targets the memory will be reclaimed. $ echo "1m nodes=0,1" > memory.reclaim Instructs the kernel to reclaim memory from the top tier nodes, which can be desirable according to the userspace policy if there is pressure on the top tiers. Since these nodes have demotion targets, the kernel will attempt demotion first. Since commit 3f1509c57b1b ("Revert "mm/vmscan: never demote for memcg reclaim""), the proactive reclaim interface memory.reclaim does both reclaim and demotion. Reclaim and demotion incur different latency costs to the jobs in the cgroup. Demoted memory would still be addressable by the userspace at a higher latency, but reclaimed memory would need to incur a pagefault. The 'nodes' arg is useful to allow the userspace to control demotion and reclaim independently according to its policy: if the memory.reclaim is called on a node with demotion targets, it will attempt demotion first; if it is called on a node without demotion targets, it will only attempt reclaim. Link: https://lkml.kernel.org/r/20221202223533.1785418-1-almasrymina@google.com Signed-off-by: Mina Almasry Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Muchun Song Cc: Bagas Sanjaya Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Xu Cc: Yang Shi Cc: Yosry Ahmed Cc: zefan li Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ceed49516ad..2787b84eaf12 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,7 +418,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options); + unsigned int reclaim_options, + nodemask_t *nodemask); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, -- cgit From c7cdf94e9cd7a03549e61b0f85949959191b8a10 Mon Sep 17 00:00:00 2001 From: Wang Yong Date: Wed, 7 Dec 2022 07:40:11 +0000 Subject: mm: fix typo in struct pglist_data code comment change "stat" to "start". Link: https://lkml.kernel.org/r/20221207074011.GA151242@cloud Fixes: c959924b0dc5 ("memory tiering: adjust hot threshold automatically") Signed-off-by: Wang Yong Reviewed-by: "Huang, Ying" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f74891556f3..128f3cde800c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1200,7 +1200,7 @@ typedef struct pglist_data { /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* - * number of promote candidate pages at stat time of current promote + * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; -- cgit From fd4e60bf0ef8eb9edcfa12dda39e8b6ee9060492 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Thu, 24 Nov 2022 22:01:54 +0800 Subject: eventfd: change int to __u64 in eventfd_signal() ifndef CONFIG_EVENTFD Commit ee62c6b2dc93 ("eventfd: change int to __u64 in eventfd_signal()") forgot to change int to __u64 in the CONFIG_EVENTFD=n stub function. Link: https://lkml.kernel.org/r/20221124140154.104680-1-zhangqilong3@huawei.com Fixes: ee62c6b2dc93 ("eventfd: change int to __u64 in eventfd_signal()") Signed-off-by: Zhang Qilong Cc: Dylan Yudaken Cc: Jens Axboe Cc: Sha Zhengju Signed-off-by: Andrew Morton --- include/linux/eventfd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 30eb30d6909b..3cd202d3eefb 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -61,7 +61,7 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd) return ERR_PTR(-ENOSYS); } -static inline int eventfd_signal(struct eventfd_ctx *ctx, int n) +static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n) { return -ENOSYS; } -- cgit From eca36e43ee63114468e41a778b33b6886b9c5f6a Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 27 Nov 2022 17:07:32 +0100 Subject: io-mapping: move some code within the include guarded section It is spurious to have some code out-side the include guard in a .h file. Fix it. Link: https://lkml.kernel.org/r/4dbaf427d4300edba6c6bbfaf4d57493b9bec6ee.1669565241.git.christophe.jaillet@wanadoo.fr Fixes: 1fbaf8fc12a0 ("mm: add a io_mapping_map_user helper") Signed-off-by: Christophe JAILLET Signed-off-by: Andrew Morton --- include/linux/io-mapping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 66a774d2710e..09d4f17c8d3b 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -213,7 +213,7 @@ io_mapping_free(struct io_mapping *iomap) kfree(iomap); } -#endif /* _LINUX_IO_MAPPING_H */ - int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size); + +#endif /* _LINUX_IO_MAPPING_H */ -- cgit From 204c2f535d05d52bd7334629557087f9983e6879 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sat, 3 Dec 2022 18:25:21 +0800 Subject: kcov: fix spelling typos in comments Fix the typo of 'suport' in kcov.h Link: https://lkml.kernel.org/r/tencent_922CA94B789587D79FD154445D035AA19E07@qq.com Signed-off-by: Rong Tao Reviewed-by: Dmitry Vyukov Cc: Andrey Konovalov Signed-off-by: Andrew Morton --- include/linux/kcov.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kcov.h b/include/linux/kcov.h index 55dc338f6bcd..ee04256f28af 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -56,7 +56,7 @@ static inline void kcov_remote_start_usb(u64 id) /* * The softirq flavor of kcov_remote_*() functions is introduced as a temporary * work around for kcov's lack of nested remote coverage sections support in - * task context. Adding suport for nested sections is tracked in: + * task context. Adding support for nested sections is tracked in: * https://bugzilla.kernel.org/show_bug.cgi?id=210337 */ -- cgit From 860b7f27b8f78564ca5a2f607e0820b2d352a562 Mon Sep 17 00:00:00 2001 From: Andrew Melnychenko Date: Wed, 7 Dec 2022 13:35:57 +0200 Subject: linux/virtio_net.h: Support USO offload in vnet header. Now, it's possible to convert USO vnet packets from/to skb. Added support for GSO_UDP_L4 offload. Signed-off-by: Andrew Melnychenko Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index a960de68ac69..bdf8de2cdd93 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -15,6 +15,7 @@ static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) case VIRTIO_NET_HDR_GSO_TCPV6: return protocol == cpu_to_be16(ETH_P_IPV6); case VIRTIO_NET_HDR_GSO_UDP: + case VIRTIO_NET_HDR_GSO_UDP_L4: return protocol == cpu_to_be16(ETH_P_IP) || protocol == cpu_to_be16(ETH_P_IPV6); default: @@ -31,6 +32,7 @@ static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: + case VIRTIO_NET_HDR_GSO_UDP_L4: skb->protocol = cpu_to_be16(ETH_P_IP); break; case VIRTIO_NET_HDR_GSO_TCPV6: @@ -69,6 +71,11 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; + case VIRTIO_NET_HDR_GSO_UDP_L4: + gso_type = SKB_GSO_UDP_L4; + ip_proto = IPPROTO_UDP; + thlen = sizeof(struct udphdr); + break; default: return -EINVAL; } @@ -182,6 +189,8 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP_L4) + hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; else return -EINVAL; if (sinfo->gso_type & SKB_GSO_TCP_ECN) -- cgit From 983055bf839745ec2812fc55cacd96888aa0d7a6 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 1 Dec 2022 02:46:54 +0900 Subject: USB: core: export usb_cache_string() usb_cache_string() can also be useful for the drivers so export it. Signed-off-by: Vincent Mailhol Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/all/20221130174658.29282-4-mailhol.vincent@wanadoo.fr Signed-off-by: Marc Kleine-Budde --- include/linux/usb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 9ff1ad4dfad1..d2d2f41052c0 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1829,6 +1829,7 @@ static inline int usb_get_ptm_status(struct usb_device *dev, void *data) extern int usb_string(struct usb_device *dev, int index, char *buf, size_t size); +extern char *usb_cache_string(struct usb_device *udev, int index); /* wrappers that also update important state inside usbcore */ extern int usb_clear_halt(struct usb_device *dev, int pipe); -- cgit From 8a321cf7becc6c065ae595b837b826a2a81036b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 9 Dec 2022 10:21:38 -0500 Subject: net: add IFF_NO_ADDRCONF and use it in bonding to prevent ipv6 addrconf Currently, in bonding it reused the IFF_SLAVE flag and checked it in ipv6 addrconf to prevent ipv6 addrconf. However, it is not a proper flag to use for no ipv6 addrconf, for bonding it has to move IFF_SLAVE flag setting ahead of dev_open() in bond_enslave(). Also, IFF_MASTER/SLAVE are historical flags used in bonding and eql, as Jiri mentioned, the new devices like Team, Failover do not use this flag. So as Jiri suggested, this patch adds IFF_NO_ADDRCONF in priv_flags of the device to indicate no ipv6 addconf, and uses it in bonding and moves IFF_SLAVE flag setting back to its original place. Signed-off-by: Xin Long Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2287cb8eb9e4..aad12a179e54 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1662,6 +1662,7 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device + * @IFF_NO_ADDRCONF: prevent ipv6 addrconf * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN @@ -1697,7 +1698,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, - /* was IFF_LIVE_RENAME_OK */ + IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; -- cgit From 2c05bf3aa0741f4f3c72432db7801371dbbcf289 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 13 Dec 2022 12:28:51 +0100 Subject: mnt_idmapping: move ima-only helpers to ima The vfs{g,u}id_{gt,lt}_* helpers are currently not needed outside of ima and we shouldn't incentivize people to use them by placing them into the header. Let's just define them locally in the one file in ima where they are used. Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner (Microsoft) --- include/linux/mnt_idmapping.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index 092c52aa6c2c..0ccca33a7a6d 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -96,26 +96,6 @@ static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid) return vfsgid_valid(vfsgid) && __vfsgid_val(vfsgid) == __kgid_val(kgid); } -static inline bool vfsuid_gt_kuid(vfsuid_t vfsuid, kuid_t kuid) -{ - return __vfsuid_val(vfsuid) > __kuid_val(kuid); -} - -static inline bool vfsgid_gt_kgid(vfsgid_t vfsgid, kgid_t kgid) -{ - return __vfsgid_val(vfsgid) > __kgid_val(kgid); -} - -static inline bool vfsuid_lt_kuid(vfsuid_t vfsuid, kuid_t kuid) -{ - return __vfsuid_val(vfsuid) < __kuid_val(kuid); -} - -static inline bool vfsgid_lt_kgid(vfsgid_t vfsgid, kgid_t kgid) -{ - return __vfsgid_val(vfsgid) < __kgid_val(kgid); -} - /* * vfs{g,u}ids are created from k{g,u}ids. * We don't allow them to be created from regular {u,g}id. -- cgit From af80602799681c78f14fbe20b6185a56020dedee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Oct 2022 21:38:18 +0200 Subject: mm: Move mm_cachep initialization to mm_init() In order to allow using mm_alloc() much earlier, move initializing mm_cachep into mm_init(). Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221025201057.751153381@infradead.org --- include/linux/sched/task.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index d6c48163c6de..8431558641a4 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -65,6 +65,7 @@ extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); void __noreturn make_task_dead(int signr); +extern void mm_cache_init(void); extern void proc_caches_init(void); extern void fork_init(void); -- cgit From 3f4c8211d982099be693be9aa7d6fc4607dff290 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Oct 2022 21:38:21 +0200 Subject: x86/mm: Use mm_alloc() in poking_init() Instead of duplicating init_mm, allocate a fresh mm. The advantage is that mm_alloc() has much simpler dependencies. Additionally it makes more conceptual sense, init_mm has no (and must not have) user state to duplicate. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221025201057.816175235@infradead.org --- include/linux/sched/task.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 8431558641a4..357e0068497c 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -91,7 +91,6 @@ extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); -struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); -- cgit From d48567c9a0d1e605639f8a8705a61bbb55fb4e84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Oct 2022 12:13:03 +0200 Subject: mm: Introduce set_memory_rox() Because endlessly repeating: set_memory_ro() set_memory_x() is getting tedious. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/Y1jek64pXOsougmz@hirez.programming.kicks-ass.net --- include/linux/filter.h | 3 +-- include/linux/set_memory.h | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index efc42a6e3aed..f0b17aff4e66 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -860,8 +860,7 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); - set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT); - set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); + set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 369769ce7399..023ebc67a36c 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -14,6 +14,14 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif +static inline int set_memory_rox(unsigned long addr, int numpages) +{ + int ret = set_memory_ro(addr, numpages); + if (ret) + return ret; + return set_memory_x(addr, numpages); +} + #ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP static inline int set_direct_map_invalid_noflush(struct page *page) { -- cgit From 60463628c9e0a8060ac6bef0457b0505c7532c7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Oct 2022 13:19:31 +0200 Subject: x86/mm: Implement native set_memory_rox() Provide a native implementation of set_memory_rox(), avoiding the double set_memory_ro();set_memory_x(); calls. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) --- include/linux/set_memory.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 023ebc67a36c..95ac8398ee72 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -14,6 +14,7 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif +#ifndef set_memory_rox static inline int set_memory_rox(unsigned long addr, int numpages) { int ret = set_memory_ro(addr, numpages); @@ -21,6 +22,7 @@ static inline int set_memory_rox(unsigned long addr, int numpages) return ret; return set_memory_x(addr, numpages); } +#endif #ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP static inline int set_direct_map_invalid_noflush(struct page *page) -- cgit From 93b3037a1482758349f3b0431406bcc457ca1cbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Nov 2020 14:04:46 +0100 Subject: mm: Update ptep_get_lockless()'s comment Improve the comment. Suggested-by: Matthew Wilcox Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221022114424.515572025%40infradead.org --- include/linux/pgtable.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a108b60a6962..c0b29000c3c0 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -300,15 +300,12 @@ static inline pte_t ptep_get(pte_t *ptep) #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH /* - * WARNING: only to be used in the get_user_pages_fast() implementation. - * - * With get_user_pages_fast(), we walk down the pagetables without taking any - * locks. For this we would like to load the pointers atomically, but sometimes - * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What - * we do have is the guarantee that a PTE will only either go from not present - * to present, or present to not present or both -- it will not switch to a - * completely different present page without a TLB flush in between; something - * that we are blocking by holding interrupts off. + * For walking the pagetables without holding any locks. Some architectures + * (eg x86-32 PAE) cannot load the entries atomically without using expensive + * instructions. We are guaranteed that a PTE will only either go from not + * present to present, or present to not present -- it will not switch to a + * completely different present page without a TLB flush inbetween; which we + * are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * -- cgit From 024d232ae4fcd7a7ce8ea239607d6c1246d7adc8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Nov 2020 17:16:22 +0100 Subject: mm: Fix pmd_read_atomic() AFAICT there's no reason to do anything different than what we do for PTEs. Make it so (also affects SH). Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221022114424.711181252%40infradead.org --- include/linux/pgtable.h | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c0b29000c3c0..765fd4bf420f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -298,6 +298,13 @@ static inline pte_t ptep_get(pte_t *ptep) } #endif +#ifndef __HAVE_ARCH_PMDP_GET +static inline pmd_t pmdp_get(pmd_t *pmdp) +{ + return READ_ONCE(*pmdp); +} +#endif + #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures @@ -340,15 +347,42 @@ static inline pte_t ptep_get_lockless(pte_t *ptep) return pte; } -#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ +#define ptep_get_lockless ptep_get_lockless + +#if CONFIG_PGTABLE_LEVELS > 2 +static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) +{ + pmd_t pmd; + + do { + pmd.pmd_low = pmdp->pmd_low; + smp_rmb(); + pmd.pmd_high = pmdp->pmd_high; + smp_rmb(); + } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); + + return pmd; +} +#define pmdp_get_lockless pmdp_get_lockless +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ +#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ + /* * We require that the PTE can be read atomically. */ +#ifndef ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } -#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ +#endif + +#ifndef pmdp_get_lockless +static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) +{ + return pmdp_get(pmdp); +} +#endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR @@ -1318,17 +1352,10 @@ static inline int pud_trans_unstable(pud_t *pud) #endif } -#ifndef pmd_read_atomic static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { - /* - * Depend on compiler for an atomic pmd read. NOTE: this is - * only going to work, if the pmdval_t isn't larger than - * an unsigned long. - */ - return *pmdp; + return pmdp_get_lockless(pmdp); } -#endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) -- cgit From 6ca297d4784625de7b041e8451780643cf5751a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Oct 2022 14:51:44 +0200 Subject: mm: Rename GUP_GET_PTE_LOW_HIGH Since it no longer applies to only PTEs, rename it to PXX. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221022114424.776404066%40infradead.org --- include/linux/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 765fd4bf420f..7dd3df742543 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -305,7 +305,7 @@ static inline pmd_t pmdp_get(pmd_t *pmdp) } #endif -#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH +#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive @@ -365,7 +365,7 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) } #define pmdp_get_lockless pmdp_get_lockless #endif /* CONFIG_PGTABLE_LEVELS > 2 */ -#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ +#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ /* * We require that the PTE can be read atomically. -- cgit From dab6e717429e5ec795d558a0e9a5337a1ed33a3d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Nov 2020 17:20:28 +0100 Subject: mm: Rename pmd_read_atomic() There's no point in having the identical routines for PTE/PMD have different names. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221022114424.841277397%40infradead.org --- include/linux/pgtable.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 7dd3df742543..23348528ff36 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1352,11 +1352,6 @@ static inline int pud_trans_unstable(pud_t *pud) #endif } -static inline pmd_t pmd_read_atomic(pmd_t *pmdp) -{ - return pmdp_get_lockless(pmdp); -} - #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif @@ -1383,13 +1378,13 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) */ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) { - pmd_t pmdval = pmd_read_atomic(pmd); + pmd_t pmdval = pmdp_get_lockless(pmd); /* * The barrier will stabilize the pmdval in a register or on * the stack so that it will stop changing under the code. * * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, - * pmd_read_atomic is allowed to return a not atomic pmdval + * pmdp_get_lockless is allowed to return a not atomic pmdval * (for example pointing to an hugepage that has never been * mapped in the pmd). The below checks will only care about * the low part of the pmd with 32bit PAE x86 anyway, with the -- cgit From 2dff2c359e829245bc3d80e42e296876d1f0cf8e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Nov 2022 12:53:18 +0100 Subject: mm: Convert __HAVE_ARCH_P..P_GET to the new style Since __HAVE_ARCH_* style guards have been depricated in favour of defining the function name onto itself, convert pxxp_get(). Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/Y2EUEBlQXNgaJgoI@hirez.programming.kicks-ass.net --- include/linux/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 23348528ff36..70e2a7e06a76 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -291,14 +291,14 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, ptep_get_and_clear(mm, addr, ptep); } -#ifndef __HAVE_ARCH_PTEP_GET +#ifndef ptep_get static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif -#ifndef __HAVE_ARCH_PMDP_GET +#ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); -- cgit From 53eab8e76667b124615a943a033cdf97c80c242a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Dec 2022 08:20:31 -0700 Subject: block: don't clear REQ_ALLOC_CACHE for non-polled requests Since commit: b99182c501c3 ("bio: add pcpu caching for non-polling bio_put") we support bio caching for IRQ based IO as well, hence there's no need to manually clear REQ_ALLOC_CACHE if we disable polling on a request. Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index b231a665682a..22078a28d7cb 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -782,8 +782,7 @@ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) static inline void bio_clear_polled(struct bio *bio) { - /* can't support alloc cache if we turn off polling */ - bio->bi_opf &= ~(REQ_POLLED | REQ_ALLOC_CACHE); + bio->bi_opf &= ~REQ_POLLED; } struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, -- cgit From 6bb20c152b6bf7dd8ffb248f33c2593fd9aeb318 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 29 Oct 2022 01:42:02 +0200 Subject: random: do not include from random.h The header is a random.c private detail, not something to be called by other code. As such, don't make it automatically available by way of random.h. Cc: Michael Ellerman Acked-by: Heiko Carstens Reviewed-by: Christophe Leroy Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 4a2a1de423cd..b0a940af4fff 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -152,8 +152,6 @@ declare_get_random_var_wait(long, unsigned long) */ #include -#include - #ifdef CONFIG_SMP int random_prepare_cpu(unsigned int cpu); int random_online_cpu(unsigned int cpu); -- cgit From 3c202d14a9d73fb63c3dccb18feac5618c21e1c4 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 9 Oct 2022 20:45:07 -0600 Subject: prandom: remove prandom_u32_max() Convert the final two users of prandom_u32_max() that slipped in during 6.2-rc1 to use get_random_u32_below(). Then, with no more users left, we can finally remove the deprecated function. Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index c94c02ba065c..f2ed5b72b3d6 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -24,12 +24,6 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); #define prandom_init_once(pcpu_state) \ DO_ONCE(prandom_seed_full_state, (pcpu_state)) -/* Deprecated: use get_random_u32_below() instead. */ -static inline u32 prandom_u32_max(u32 ep_ro) -{ - return get_random_u32_below(ep_ro); -} - /* * Handle minimum values for seeds */ -- cgit From 685e6311637e46f3212439ce2789f8a300e5050f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Dec 2022 10:30:45 +0100 Subject: nvme: fix the NVME_CMD_EFFECTS_CSE_MASK definition 3 << 16 does not generate the correct mask for bits 16, 17 and 18. Use the GENMASK macro to generate the correct mask instead. Fixes: 84fef62d135b ("nvme: check admin passthru command effects") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi --- include/linux/nvme.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d6be2a686100..d1cd53f2b6ab 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -7,6 +7,7 @@ #ifndef _LINUX_NVME_H #define _LINUX_NVME_H +#include #include #include @@ -639,7 +640,7 @@ enum { NVME_CMD_EFFECTS_NCC = 1 << 2, NVME_CMD_EFFECTS_NIC = 1 << 3, NVME_CMD_EFFECTS_CCC = 1 << 4, - NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, + NVME_CMD_EFFECTS_CSE_MASK = GENMASK(18, 16), NVME_CMD_EFFECTS_UUID_SEL = 1 << 19, }; -- cgit From 6f99ac04c469b5d0a180a4ccea99d25d5dc9d21c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Dec 2022 16:13:38 +0100 Subject: nvme: consult the CSE log page for unprivileged passthrough Commands like Write Zeros can change the contents of a namespaces without actually transferring data. To protect against this, check the Commands Supported and Effects log is supported by the controller for any unprivileg command passthrough and refuse unprivileged passthrough if the command has any effects that can change data or metadata. Note: While the Commands Support and Effects log page has only been mandatory since NVMe 2.0, it is widely supported because Windows requires it for any command passthrough from userspace. Fixes: e4fbcf32c860 ("nvme: identify-namespace without CAP_SYS_ADMIN") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d1cd53f2b6ab..4fad4aa245fb 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -642,6 +642,7 @@ enum { NVME_CMD_EFFECTS_CCC = 1 << 4, NVME_CMD_EFFECTS_CSE_MASK = GENMASK(18, 16), NVME_CMD_EFFECTS_UUID_SEL = 1 << 19, + NVME_CMD_EFFECTS_SCOPE_MASK = GENMASK(31, 20), }; struct nvme_effects_log { -- cgit