From 1470afefc3c42df5d1662f87d079b46651bdc95b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 15 Mar 2023 17:31:02 -0700 Subject: cpumask: introduce for_each_cpu_or Equivalent of for_each_cpu_and, except it ORs the two masks together so it iterates all the CPUs present in either mask. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/cpumask.h | 17 +++++++++++++++++ include/linux/find.h | 37 +++++++++++++++++++++++++++++++++++++ lib/find_bit.c | 9 +++++++++ 3 files changed, 63 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 8fbe76607965..220974ef1bf5 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -350,6 +350,23 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta #define for_each_cpu_andnot(cpu, mask1, mask2) \ for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) +/** + * for_each_cpu_or - iterate over every cpu present in either mask + * @cpu: the (optionally unsigned) integer iterator + * @mask1: the first cpumask pointer + * @mask2: the second cpumask pointer + * + * This saves a temporary CPU mask in many places. It is equivalent to: + * struct cpumask tmp; + * cpumask_or(&tmp, &mask1, &mask2); + * for_each_cpu(cpu, &tmp) + * ... + * + * After the loop, cpu is >= nr_cpu_ids. + */ +#define for_each_cpu_or(cpu, mask1, mask2) \ + for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) + /** * cpumask_any_but - return a "random" in a cpumask, but not this one. * @mask: the cpumask to search diff --git a/include/linux/find.h b/include/linux/find.h index 4647864a5ffd..5e4f39ef2e72 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -14,6 +14,8 @@ unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long unsigned long nbits, unsigned long start); unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); +unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); @@ -127,6 +129,36 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1, } #endif +#ifndef find_next_or_bit +/** + * find_next_or_bit - find the next set bit in either memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_or_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = (*addr1 | *addr2) & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_or_bit(addr1, addr2, size, offset); +} +#endif + #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region @@ -536,6 +568,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) +#define for_each_or_bit(bit, addr1, addr2, size) \ + for ((bit) = 0; \ + (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ + (bit)++) + /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) diff --git a/lib/find_bit.c b/lib/find_bit.c index c10920e66788..32f99e9a670e 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -182,6 +182,15 @@ unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned l EXPORT_SYMBOL(_find_next_andnot_bit); #endif +#ifndef find_next_or_bit +unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long nbits, unsigned long start) +{ + return FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start); +} +EXPORT_SYMBOL(_find_next_or_bit); +#endif + #ifndef find_next_zero_bit unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) -- cgit From 8b57b11cca88f397035a95b9e12b03511847b0e8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 15 Mar 2023 17:31:02 -0700 Subject: pcpcntrs: fix dying cpu summation race In commit f689054aace2 ("percpu_counter: add percpu_counter_sum_all interface") a race condition between a cpu dying and percpu_counter_sum() iterating online CPUs was identified. The solution was to iterate all possible CPUs for summation via percpu_counter_sum_all(). We recently had a percpu_counter_sum() call in XFS trip over this same race condition and it fired a debug assert because the filesystem was unmounting and the counter *should* be zero just before we destroy it. That was reported here: https://lore.kernel.org/linux-kernel/20230314090649.326642-1-yebin@huaweicloud.com/ likely as a result of running generic/648 which exercises filesystems in the presence of CPU online/offline events. The solution to use percpu_counter_sum_all() is an awful one. We use percpu counters and percpu_counter_sum() for accurate and reliable threshold detection for space management, so a summation race condition during these operations can result in overcommit of available space and that may result in filesystem shutdowns. As percpu_counter_sum_all() iterates all possible CPUs rather than just those online or even those present, the mask can include CPUs that aren't even installed in the machine, or in the case of machines that can hot-plug CPU capable nodes, even have physical sockets present in the machine. Fundamentally, this race condition is caused by the CPU being offlined being removed from the cpu_online_mask before the notifier that cleans up per-cpu state is run. Hence percpu_counter_sum() will not sum the count for a cpu currently being taken offline, regardless of whether the notifier has run or not. This is the root cause of the bug. The percpu counter notifier iterates all the registered counters, locks the counter and moves the percpu count to the global sum. This is serialised against other operations that move the percpu counter to the global sum as well as percpu_counter_sum() operations that sum the percpu counts while holding the counter lock. Hence the notifier is safe to run concurrently with sum operations, and the only thing we actually need to care about is that percpu_counter_sum() iterates dying CPUs. That's trivial to do, and when there are no CPUs dying, it has no addition overhead except for a cpumask_or() operation. This change makes percpu_counter_sum() always do the right thing in the presence of CPU hot unplug events and makes percpu_counter_sum_all() unnecessary. This, in turn, means that filesystems like XFS, ext4, and btrfs don't have to work out when they should use percpu_counter_sum() vs percpu_counter_sum_all() in their space accounting algorithms Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- lib/percpu_counter.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index dba56c5c1837..0e096311e0c0 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -131,7 +131,7 @@ static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc, raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; - for_each_cpu(cpu, cpu_mask) { + for_each_cpu_or(cpu, cpu_online_mask, cpu_mask) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } @@ -141,11 +141,20 @@ static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc, /* * Add up all the per-cpu counts, return the result. This is a more accurate - * but much slower version of percpu_counter_read_positive() + * but much slower version of percpu_counter_read_positive(). + * + * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums + * from CPUs that are in the process of being taken offline. Dying cpus have + * been removed from the online mask, but may not have had the hotplug dead + * notifier called to fold the percpu count back into the global counter sum. + * By including dying CPUs in the iteration mask, we avoid this race condition + * so __percpu_counter_sum() just does the right thing when CPUs are being taken + * offline. */ s64 __percpu_counter_sum(struct percpu_counter *fbc) { - return __percpu_counter_sum_mask(fbc, cpu_online_mask); + + return __percpu_counter_sum_mask(fbc, cpu_dying_mask); } EXPORT_SYMBOL(__percpu_counter_sum); -- cgit From 7ba85fba47bd89618fdb7dc322bdf823b1b56efb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 15 Mar 2023 17:31:03 -0700 Subject: fork: remove use of percpu_counter_sum_all This effectively reverts the change made in commit f689054aace2 ("percpu_counter: add percpu_counter_sum_all interface") as the race condition percpu_counter_sum_all() was invented to avoid is now handled directly in percpu_counter_sum() and nobody needs to care about summing racing with cpu unplug anymore. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- kernel/fork.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index f68954d05e89..bcc3085e79ef 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -755,11 +755,6 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); - if (likely(!x)) - continue; - - /* Making sure this is not due to race with CPU offlining. */ - x = percpu_counter_sum_all(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", mm, resident_page_types[i], x); -- cgit From e9b60c7f97130795c7aa81a649ae4b93a172a277 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 15 Mar 2023 17:31:03 -0700 Subject: pcpcntr: remove percpu_counter_sum_all() percpu_counter_sum_all() is now redundant as the race condition it was invented to handle is now dealt with by percpu_counter_sum() directly and all users of percpu_counter_sum_all() have been removed. Remove it. This effectively reverts the changes made in f689054aace2 ("percpu_counter: add percpu_counter_sum_all interface") except for the cpumask iteration that fixes percpu_counter_sum() made earlier in this series. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/percpu_counter.h | 6 ------ lib/percpu_counter.c | 40 +++++++++++----------------------------- 2 files changed, 11 insertions(+), 35 deletions(-) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 521a733e21a9..75b73c83bc9d 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -45,7 +45,6 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); -s64 percpu_counter_sum_all(struct percpu_counter *fbc); int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); void percpu_counter_sync(struct percpu_counter *fbc); @@ -196,11 +195,6 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc) return percpu_counter_read(fbc); } -static inline s64 percpu_counter_sum_all(struct percpu_counter *fbc) -{ - return percpu_counter_read(fbc); -} - static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return true; diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 0e096311e0c0..5004463c4f9f 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -122,23 +122,6 @@ void percpu_counter_sync(struct percpu_counter *fbc) } EXPORT_SYMBOL(percpu_counter_sync); -static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc, - const struct cpumask *cpu_mask) -{ - s64 ret; - int cpu; - unsigned long flags; - - raw_spin_lock_irqsave(&fbc->lock, flags); - ret = fbc->count; - for_each_cpu_or(cpu, cpu_online_mask, cpu_mask) { - s32 *pcount = per_cpu_ptr(fbc->counters, cpu); - ret += *pcount; - } - raw_spin_unlock_irqrestore(&fbc->lock, flags); - return ret; -} - /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive(). @@ -153,22 +136,21 @@ static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc, */ s64 __percpu_counter_sum(struct percpu_counter *fbc) { + s64 ret; + int cpu; + unsigned long flags; - return __percpu_counter_sum_mask(fbc, cpu_dying_mask); + raw_spin_lock_irqsave(&fbc->lock, flags); + ret = fbc->count; + for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { + s32 *pcount = per_cpu_ptr(fbc->counters, cpu); + ret += *pcount; + } + raw_spin_unlock_irqrestore(&fbc->lock, flags); + return ret; } EXPORT_SYMBOL(__percpu_counter_sum); -/* - * This is slower version of percpu_counter_sum as it traverses all possible - * cpus. Use this only in the cases where accurate data is needed in the - * presense of CPUs getting offlined. - */ -s64 percpu_counter_sum_all(struct percpu_counter *fbc) -{ - return __percpu_counter_sum_mask(fbc, cpu_possible_mask); -} -EXPORT_SYMBOL(percpu_counter_sum_all); - int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key) { -- cgit