From 6802f9347993a534797847f627c27f4334067945 Mon Sep 17 00:00:00 2001
From: Kyle Meyer
Date: Wed, 10 Apr 2024 16:33:10 -0500
Subject: [PATCH 01/10] cpumask: Add for_each_cpu_from()

Add for_each_cpu_from() as a generic cpumask macro.

for_each_cpu_from() is the same as for_each_cpu(), except it starts at
@cpu instead of zero.

Signed-off-by: Kyle Meyer
Acked-by: Yury Norov
Signed-off-by: Yury Norov
---
 include/linux/cpumask.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 1c29947db848..d75060fbd058 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -368,6 +368,16 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
 #define for_each_cpu_or(cpu, mask1, mask2) \
         for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
 
+/**
+ * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask.
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the cpumask pointer
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_from(cpu, mask) \
+        for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits)
+
 /**
  * cpumask_any_but - return a "random" in a cpumask, but not this one.
  * @mask: the cpumask to search

From 05037e5f0f17935a86861f9610f941ebf346a95e Mon Sep 17 00:00:00 2001
From: Kyle Meyer
Date: Wed, 10 Apr 2024 16:33:11 -0500
Subject: [PATCH 02/10] sched/topology: Optimize topology_span_sane()

Optimize topology_span_sane() by removing duplicate comparisons.

Since topology_span_sane() is called inside of for_each_cpu(), each
previous CPU has already been compared against every other CPU. The
current CPU only needs to be compared against higher-numbered CPUs.

The total number of comparisons is reduced from N * (N - 1) to
N * (N - 1) / 2 on each non-NUMA scheduling domain level.

Signed-off-by: Kyle Meyer
Reviewed-by: Yury Norov
Acked-by: Vincent Guittot
Signed-off-by: Yury Norov
---
 kernel/sched/topology.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 99ea5986038c..b6bcafc09969 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2347,7 +2347,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
 static bool topology_span_sane(struct sched_domain_topology_level *tl,
                                const struct cpumask *cpu_map, int cpu)
 {
-        int i;
+        int i = cpu + 1;
 
         /* NUMA levels are allowed to overlap */
         if (tl->flags & SDTL_OVERLAP)
@@ -2359,9 +2359,7 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl,
          * breaking the sched_group lists - i.e. a later get_group() pass
          * breaks the linking done for an earlier span.
          */
-        for_each_cpu(i, cpu_map) {
-                if (i == cpu)
-                        continue;
+        for_each_cpu_from(i, cpu_map) {
                 /*
                  * We should 'and' all those masks with 'cpu_map' to exactly
                  * match the topology we're about to build, but that can only
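The effect of these two patches is easiest to see side by side. Below is a
minimal illustrative sketch (not part of the series) contrasting the old
skip-self loop with the new starting-point iteration; "cpu_map" and "cpu"
stand in for the caller's cpumask and current CPU:

    /*
     * Illustrative sketch only, not code from the series: how the new
     * for_each_cpu_from() replaces the "skip the current CPU" pattern.
     * "cpu_map" and "cpu" are placeholders for the caller's values.
     */
    int i;

    /* Before: walk every CPU in the mask and skip the current one. */
    for_each_cpu(i, cpu_map) {
            if (i == cpu)
                    continue;
            /* ... compare the topology masks of cpu and i ... */
    }

    /* After: start just past the current CPU, halving the comparisons. */
    i = cpu + 1;
    for_each_cpu_from(i, cpu_map) {
            /* ... compare the topology masks of cpu and i ... */
    }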
From efe3a85eab78b6cc02bdfd16aec4410111ea69c0 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Fri, 3 May 2024 12:12:00 -0700
Subject: [PATCH 03/10] Compiler Attributes: Add __always_used macro

In some cases, like performance benchmarking, we need to call a function
but don't need to read the returned value. If the compiler recognizes the
function as pure or const, it can remove the function invocation, which is
not what we want.

To prevent that, the common practice is to assign the return value to a
temporary static volatile variable. From the compiler's point of view, the
variable is unused because it is never read back after being assigned. To
make sure the variable is always emitted, we provide the __used attribute.

This works with GCC, but clang still emits -Wunused-but-set-variable. To
suppress that warning, we also need to mark the variable with the 'unused'
attribute. Nathan Chancellor explained it in detail:

  While having used and unused attributes together might look unusual,
  reading the GCC attribute manual makes it seem like these attributes
  fulfill similar yet different roles: __unused__ prevents any unused
  warnings, while __used__ forces the variable to be emitted. A strict
  reading of that does not make it seem like __used__ implies disabling
  unused warnings.

The compiler documentation makes it clear what happens behind the 'used'
and 'unused' attributes, but the chosen names may confuse readers when
such a combination catches the eye in random code.

This patch adds the __always_used macro, which combines both attributes
and comments on what happens, for those interested in the details.

Suggested-by: Nathan Chancellor
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202405030808.UsoMKFNP-lkp@intel.com/
Reviewed-by: Nathan Chancellor
Acked-by: Miguel Ojeda
Signed-off-by: Yury Norov
---
 include/linux/compiler_attributes.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 8bdf6e0918c1..32284cd26d52 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -361,6 +361,19 @@
  */
 #define __used __attribute__((__used__))
 
+/*
+ * The __used attribute guarantees that the attributed variable will
+ * always be emitted by the compiler. It doesn't prevent the compiler
+ * from throwing 'unused' warnings when it can't detect how the variable
+ * is actually used. It's a compiler implementation detail whether to
+ * emit the warning in that case or not.
+ *
+ * The combination of both 'used' and 'unused' attributes ensures that
+ * the variable is emitted and will not trigger 'unused' warnings.
+ * The attribute is applicable to functions, static and global variables.
+ */
+#define __always_used __used __maybe_unused
+
 /*
  * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-warn_005funused_005fresult-function-attribute
  * clang: https://clang.llvm.org/docs/AttributeReference.html#nodiscard-warn-unused-result
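A hypothetical sketch of the pattern the macro is meant for (bench_me() is
a made-up placeholder, not a real kernel function): the volatile store keeps
the measured call alive under both GCC and clang, and __always_used keeps
both compilers quiet about the write-only variable.

    /*
     * Hypothetical sketch, not from the series: keeping a measured call
     * from being optimized away. bench_me() is a placeholder function.
     */
    extern unsigned long bench_me(unsigned long x);     /* placeholder */

    static volatile __always_used unsigned long sink;

    static void run_benchmark(void)
    {
            unsigned long i;

            for (i = 0; i < 10000; i++)
                    sink = bench_me(i);  /* store keeps the call from being elided */
    }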
From 0a2c6664e56f0dff7535c3d3d9a6174279e18acc Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Thu, 2 May 2024 17:24:42 +0800
Subject: [PATCH 04/10] lib/test_bitops: Add benchmark test for fns()

Introduce a benchmark test for fns(). It measures the total time taken
by fns() to process 10,000 random test values generated with
get_random_bytes(), for each n in the range [0, BITS_PER_LONG).

Example output:

  test_bitops: fns: 7637268 ns

CC: Andrew Morton
CC: Rasmus Villemoes
CC: David Laight
Signed-off-by: Kuan-Wei Chiu
Suggested-by: Yury Norov
Signed-off-by: Yury Norov
---
 lib/test_bitops.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/lib/test_bitops.c b/lib/test_bitops.c
index 3b7bcbee84db..55669624bb28 100644
--- a/lib/test_bitops.c
+++ b/lib/test_bitops.c
@@ -5,9 +5,11 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include
 #include
 #include
 #include
+#include
 
 /* a tiny module only meant to test
  *
@@ -50,6 +52,30 @@ static unsigned long order_comb_long[][2] = {
 };
 #endif
 
+static int __init test_fns(void)
+{
+        static volatile __always_used unsigned long tmp __initdata;
+        unsigned long *buf __free(kfree) = NULL;
+        unsigned int i, n;
+        ktime_t time;
+
+        buf = kmalloc_array(10000, sizeof(unsigned long), GFP_KERNEL);
+        if (!buf)
+                return -ENOMEM;
+
+        get_random_bytes(buf, 10000 * sizeof(unsigned long));
+        time = ktime_get();
+
+        for (n = 0; n < BITS_PER_LONG; n++)
+                for (i = 0; i < 10000; i++)
+                        tmp = fns(buf[i], n);
+
+        time = ktime_get() - time;
+        pr_err("fns: %18llu ns\n", time);
+
+        return 0;
+}
+
 static int __init test_bitops_startup(void)
 {
         int i, bit_set;
@@ -94,6 +120,8 @@ static int __init test_bitops_startup(void)
         if (bit_set != BITOPS_LAST)
                 pr_err("ERROR: FOUND SET BIT %d\n", bit_set);
 
+        test_fns();
+
         pr_info("Completed bitops test\n");
 
         return 0;
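One detail worth noting in the test above is the scoped-cleanup annotation
on the buffer. A hypothetical sketch of that idiom follows (the demo_fill()
name is made up); with __free(kfree), built on the kernel's scope-based
cleanup machinery, the allocation is released automatically on every return
path, so the early -ENOMEM return needs no unwinding:

    /*
     * Hypothetical sketch of the scoped-cleanup idiom used by test_fns().
     * The __free(kfree) annotation frees the buffer automatically when it
     * goes out of scope, on every return path.
     */
    static int demo_fill(void)
    {
            unsigned long *buf __free(kfree) = NULL;

            buf = kmalloc_array(10000, sizeof(*buf), GFP_KERNEL);
            if (!buf)
                    return -ENOMEM;         /* nothing to unwind */

            get_random_bytes(buf, 10000 * sizeof(*buf));
            /* ... use buf ... */

            return 0;                       /* buf is kfree()'d here automatically */
    }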
From 1c2aa5619348f7573d6f2269e04fd1dac8eddc47 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Thu, 2 May 2024 17:24:43 +0800
Subject: [PATCH 05/10] bitops: Optimize fns() for improved performance

The current fns() repeatedly uses __ffs() to find the index of the
least significant bit and then clears the corresponding bit using
__clear_bit(). The method for clearing the least significant bit can be
optimized by using word &= word - 1 instead.

Typically, the execution time of one __ffs() plus one __clear_bit() is
longer than that of a bitwise AND operation and a subtraction. To
improve performance, the loop for clearing the least significant bit
has been replaced with word &= word - 1, followed by a single __ffs()
operation to obtain the answer. This change reduces the number of
__ffs() iterations from n to just one, enhancing overall performance.

This modification significantly accelerates the fns() function in the
test_bitops benchmark, improving its speed by approximately 7.6 times.
Additionally, it enhances the performance of find_nth_bit() in the
find_bit benchmark by approximately 26%.

Before:
  test_bitops: fns: 58033164 ns
  find_nth_bit: 4254313 ns, 16525 iterations

After:
  test_bitops: fns: 7637268 ns
  find_nth_bit: 3362863 ns, 16501 iterations

CC: Andrew Morton
CC: Rasmus Villemoes
Signed-off-by: Kuan-Wei Chiu
Signed-off-by: Yury Norov
---
 include/linux/bitops.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 2ba557e067fe..57ecef354f47 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -254,16 +254,10 @@ static inline unsigned long __ffs64(u64 word)
  */
 static inline unsigned long fns(unsigned long word, unsigned int n)
 {
-        unsigned int bit;
+        while (word && n--)
+                word &= word - 1;
 
-        while (word) {
-                bit = __ffs(word);
-                if (n-- == 0)
-                        return bit;
-                __clear_bit(bit, &word);
-        }
-
-        return BITS_PER_LONG;
+        return word ? __ffs(word) : BITS_PER_LONG;
 }
 
 /**
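To make the trick concrete, here is a small illustrative walk-through (not
from the patch) of why word &= word - 1 clears exactly the lowest set bit,
so that n iterations of it leave the n'th set bit as the lowest one for the
final __ffs():

    /*
     * Illustration only: word &= word - 1 clears the lowest set bit.
     *
     *   word     = 0b101100
     *   word - 1 = 0b101011  (the borrow flips the lowest set bit and the zeros below it)
     *   AND      = 0b101000  (only the lowest set bit is gone)
     *
     * So for fns(0b101100, 1), one pass leaves 0b101000, and
     * __ffs(0b101000) returns 3, the 2nd (n == 1, zero-based) set bit.
     */
    unsigned long word = 0x2c;          /* 0b101100: bits 2, 3 and 5 set */
    unsigned int n = 1;

    while (word && n--)
            word &= word - 1;           /* after one pass: 0b101000 */

    /* word is non-zero, so the result is __ffs(word) == 3 */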
From 77db1920a88103e8ef9ee58130df7c970aea3d17 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Thu, 2 May 2024 09:56:33 -0700
Subject: [PATCH 06/10] lib: make test_bitops compilable into the kernel image

The test is currently limited to being built as a module. There's no
technical reason for that. Now that the test carries some performance
benchmarks, it would be reasonable to run it at kernel load time, before
userspace starts, to reduce possible jitter.

Reviewed-by: Kuan-Wei Chiu
Signed-off-by: Yury Norov
---
 lib/Kconfig.debug | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c63a5fbf1f1c..fc8fe1ea5b49 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2436,7 +2436,6 @@ config TEST_LKM
 
 config TEST_BITOPS
         tristate "Test module for compilation of bitops operations"
-        depends on m
         help
           This builds the "test_bitops" module that is much like the
           TEST_LKM module except that it does a basic exercise of the

From 0b2811ba11b04353033237359c9d042eb0cdc1c1 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Thu, 2 May 2024 10:12:56 -0700
Subject: [PATCH 07/10] bitmap: relax find_nth_bit() limitation on return value

The function claims to return the bitmap size if the Nth bit doesn't
exist. This rule is violated in the inline case because the fns() used
there doesn't know anything about the size of the bitmap.

So, relax this requirement to '>= size', and make the outline
implementation a bit cheaper. All in-tree kernel users of find_nth_bit()
are safe against that.

Reported-by: Rasmus Villemoes
Closes: https://lore.kernel.org/all/Zi50cAgR8nZvgLa3@yury-ThinkPad/T/#m6da806a0525e74dcc91f35e5f20766ed4e853e8a
Signed-off-by: Yury Norov
---
 include/linux/find.h | 2 +-
 lib/find_bit.c       | 2 +-
 lib/test_bitmap.c    | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/find.h b/include/linux/find.h
index c69598e383c1..02751e43d448 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -220,7 +220,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
  *      idx = find_first_bit(addr, size);
  *
  * Returns the bit number of the N'th set bit.
- * If no such, returns @size.
+ * If no such, returns >= @size.
  */
 static inline
 unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 32f99e9a670e..0bddfc3ff248 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -87,7 +87,7 @@ out: \
         if (sz % BITS_PER_LONG) \
                 tmp = (FETCH) & BITMAP_LAST_WORD_MASK(sz); \
 found: \
-        sz = min(idx * BITS_PER_LONG + fns(tmp, nr), sz); \
+        sz = idx * BITS_PER_LONG + fns(tmp, nr); \
 out: \
         sz; \
 })
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 6b2b33579f56..088838f829c9 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -244,7 +244,7 @@ static void __init test_find_nth_bit(void)
         expect_eq_uint(60, find_nth_bit(bmap, 64 * 3, 5));
         expect_eq_uint(80, find_nth_bit(bmap, 64 * 3, 6));
         expect_eq_uint(123, find_nth_bit(bmap, 64 * 3, 7));
-        expect_eq_uint(64 * 3, find_nth_bit(bmap, 64 * 3, 8));
+        expect_eq_uint(0, !!(find_nth_bit(bmap, 64 * 3, 8) < 64 * 3));
 
         expect_eq_uint(10, find_nth_bit(bmap, 64 * 3 - 1, 0));
         expect_eq_uint(20, find_nth_bit(bmap, 64 * 3 - 1, 1));
@@ -254,7 +254,7 @@ static void __init test_find_nth_bit(void)
         expect_eq_uint(60, find_nth_bit(bmap, 64 * 3 - 1, 5));
         expect_eq_uint(80, find_nth_bit(bmap, 64 * 3 - 1, 6));
         expect_eq_uint(123, find_nth_bit(bmap, 64 * 3 - 1, 7));
-        expect_eq_uint(64 * 3 - 1, find_nth_bit(bmap, 64 * 3 - 1, 8));
+        expect_eq_uint(0, !!(find_nth_bit(bmap, 64 * 3 - 1, 8) < 64 * 3 - 1));
 
         for_each_set_bit(bit, exp1, EXP1_IN_BITS) {
                 b = find_nth_bit(exp1, EXP1_IN_BITS, cnt++);
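For callers, the practical consequence of the relaxed contract is that only
the comparison against the size may be relied on, not equality. A short
illustrative sketch (not from the series), assuming a caller with a bitmap
"map" of "nbits" bits:

    /*
     * Illustrative sketch: checking find_nth_bit() under the relaxed
     * contract. "map" and "nbits" are placeholders for the caller's bitmap.
     */
    unsigned long bit = find_nth_bit(map, nbits, n);

    if (bit >= nbits) {
            /* correct: fewer than n + 1 bits are set; bit may be any value >= nbits */
    } else {
            /* bit is the n'th (zero-based) set bit */
    }

    /* Fragile: testing "bit == nbits" is no longer guaranteed to catch the miss. */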
From fe708f915541b0b7b9ba066f73007fde69a0d2ea Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Thu, 2 May 2024 16:32:04 -0700
Subject: [PATCH 08/10] MAINTAINERS: add BITOPS API record

The bitops API is very basic and is widely used by the kernel, but the
corresponding files are not maintained. Bitmaps actively use bit
operations, and a big share of bitops material already moves through the
bitmap branch. I would like to take a closer look at bitops.

This patch creates a BITOPS API record in MAINTAINERS, adding Rasmus as a
reviewer and myself as a maintainer of those files.

CC: Rasmus Villemoes
Signed-off-by: Yury Norov
---
 MAINTAINERS | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c23fda1aa1f0..112a9b32b12e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3638,6 +3638,20 @@ F: tools/include/vdso/bits.h
 F: tools/lib/bitmap.c
 F: tools/lib/find_bit.c
 
+BITOPS API
+M: Yury Norov
+R: Rasmus Villemoes
+S: Maintained
+F: arch/*/include/asm/bitops.h
+F: arch/*/include/asm/bitops_32.h
+F: arch/*/include/asm/bitops_64.h
+F: arch/*/lib/bitops.c
+F: include/asm-generic/bitops
+F: include/asm-generic/bitops.h
+F: include/linux/bitops.h
+F: lib/test_bitops.c
+F: tools/*/bitops*
+
 BLINKM RGB LED DRIVER
 M: Jan-Simon Moeller
 S: Maintained

From 9f2c2d6ba13da08643c65b948ce5e3d616864c47 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Tue, 7 May 2024 23:01:31 +0300
Subject: [PATCH 09/10] bitops: Move aligned_byte_mask() to wordpart.h

The bitops.h header is for bit-related operations. The
aligned_byte_mask() macro is about byte (or part of the machine word)
operations, for which we have a separate header. Move the macro to
wordpart.h to consolidate similar operations.

Signed-off-by: Andy Shevchenko
Signed-off-by: Yury Norov
---
 include/linux/bitops.h   | 7 -------
 include/linux/wordpart.h | 7 +++++++
 lib/usercopy.c           | 1 +
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 57ecef354f47..3313d2c04e6d 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -8,13 +8,6 @@
 
 #include
 
-/* Set bits in the first 'n' bytes when loaded from memory */
-#ifdef __LITTLE_ENDIAN
-# define aligned_byte_mask(n) ((1UL << 8*(n))-1)
-#else
-# define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n)))
-#endif
-
 #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
 #define BITS_TO_LONGS(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
 #define BITS_TO_U64(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64))
diff --git a/include/linux/wordpart.h b/include/linux/wordpart.h
index f6f8f83b15b0..4ca1ba66d2f0 100644
--- a/include/linux/wordpart.h
+++ b/include/linux/wordpart.h
@@ -39,4 +39,11 @@
  */
 #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))
 
+/* Set bits in the first 'n' bytes when loaded from memory */
+#ifdef __LITTLE_ENDIAN
+# define aligned_byte_mask(n) ((1UL << 8*(n))-1)
+#else
+# define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n)))
+#endif
+
 #endif // _LINUX_WORDPART_H
diff --git a/lib/usercopy.c b/lib/usercopy.c
index d29fe29c6849..4b62e6299cc8 100644
--- a/lib/usercopy.c
+++ b/lib/usercopy.c
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 
 /* out-of-line parts */

From 5671dca241b9a2f4ecf88d8e992041cfb580e0a5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Tue, 7 May 2024 23:01:32 +0300
Subject: [PATCH 10/10] usercopy: Don't use "proxy" headers

Update header inclusions to follow the IWYU (Include What You Use)
principle.

Signed-off-by: Andy Shevchenko
Signed-off-by: Yury Norov
---
 lib/usercopy.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/usercopy.c b/lib/usercopy.c
index 4b62e6299cc8..499a7a7d54db 100644
--- a/lib/usercopy.c
+++ b/lib/usercopy.c
@@ -1,10 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
-#include
+#include
+#include
+#include
 #include
 #include
+#include
+#include
+#include
 #include
 #include
-
 
 /* out-of-line parts */
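As a quick sanity check on the relocated macro (illustrative arithmetic,
not part of the patch), here is what aligned_byte_mask() evaluates to on a
64-bit little-endian build:

    /*
     * Illustrative values only: aligned_byte_mask(n) is a mask covering
     * the first n bytes of a word as it sits in memory.
     *
     * On a 64-bit little-endian build:
     *   aligned_byte_mask(1) == (1UL << 8)  - 1 == 0x00000000000000ff
     *   aligned_byte_mask(2) == (1UL << 16) - 1 == 0x000000000000ffff
     *   aligned_byte_mask(7) == (1UL << 56) - 1 == 0x00ffffffffffffff
     *
     * On big-endian the "first" bytes in memory are the most significant
     * ones, hence the alternative definition that shifts ~0xffUL down from
     * the top of the word (e.g. aligned_byte_mask(1) == 0xff00000000000000).
     */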