2019-05-27 14:55:01 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2015-04-17 03:43:16 +08:00
|
|
|
/* bit search implementation
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
|
|
|
|
* Written by David Howells (dhowells@redhat.com)
|
|
|
|
*
|
2015-04-17 03:43:16 +08:00
|
|
|
* Copyright (C) 2008 IBM Corporation
|
|
|
|
* 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
|
|
|
|
* (Inspired by David Howell's find_next_bit implementation)
|
|
|
|
*
|
2015-04-17 03:43:13 +08:00
|
|
|
* Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
|
|
|
|
* size and improve performance, 2015.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/bitops.h>
|
2015-04-17 03:43:16 +08:00
|
|
|
#include <linux/bitmap.h>
|
2011-11-17 10:29:17 +08:00
|
|
|
#include <linux/export.h>
|
2020-12-16 12:42:48 +08:00
|
|
|
#include <linux/math.h>
|
2020-10-16 11:10:21 +08:00
|
|
|
#include <linux/minmax.h>
|
2020-12-16 12:42:48 +08:00
|
|
|
#include <linux/swab.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
lib/find_bit: introduce FIND_FIRST_BIT() macro
Now that we have many flavors of find_first_bit(), and expect even more,
it's better to have one macro that generates optimal code for all and makes
maintaining of slightly different functions simpler.
The logic common to all versions is moved to the new macro, and all the
flavors are generated by providing an FETCH macro-parameter, like
in this example:
#define FIND_FIRST_BIT(FETCH, MUNGE, size) ...
find_first_ornot_and_bit(addr1, addr2, addr3, size)
{
return FIND_FIRST_BIT(addr1[idx] | ~addr2[idx] & addr3[idx], /* nop */, size);
}
The FETCH may be of any complexity, as soon as it only refers
the bitmap(s) and an iterator idx.
MUNGE is here to support _le code generation for BE builds. May be
empty.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
2022-09-15 10:07:27 +08:00
|
|
|
/*
|
|
|
|
* Common helper for find_bit() function family
|
|
|
|
* @FETCH: The expression that fetches and pre-processes each word of bitmap(s)
|
|
|
|
* @MUNGE: The expression that post-processes a word containing found bit (may be empty)
|
|
|
|
* @size: The bitmap size in bits
|
|
|
|
*/
|
|
|
|
#define FIND_FIRST_BIT(FETCH, MUNGE, size) \
|
|
|
|
({ \
|
|
|
|
unsigned long idx, val, sz = (size); \
|
|
|
|
\
|
|
|
|
for (idx = 0; idx * BITS_PER_LONG < sz; idx++) { \
|
|
|
|
val = (FETCH); \
|
|
|
|
if (val) { \
|
|
|
|
sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(val)), sz); \
|
|
|
|
break; \
|
|
|
|
} \
|
|
|
|
} \
|
|
|
|
\
|
|
|
|
sz; \
|
|
|
|
})
|
|
|
|
|
2020-01-31 14:16:43 +08:00
|
|
|
#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
|
|
|
|
!defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \
|
|
|
|
!defined(find_next_and_bit)
|
2008-03-11 23:17:19 +08:00
|
|
|
/*
|
lib: optimize cpumask_next_and()
We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
It's essentially a joined iteration in search for a non-zero bit, which is
currently implemented as a lookup join (find a nonzero bit on the lhs,
lookup the rhs to see if it's set there).
Implement a direct join (find a nonzero bit on the incrementally built
join). Also add generic bitmap benchmarks in the new `test_find_bit`
module for new function (see `find_next_and_bit` in [2] and [3] below).
For cpumask_next_and, direct benchmarking shows that it's 1.17x to 14x
faster with a geometric mean of 2.1 on 32 CPUs [1]. No impact on memory
usage. Note that on Arm, the new pure-C implementation still outperforms
the old one that uses a mix of C and asm (`find_next_bit`) [3].
[1] Approximate benchmark code:
```
unsigned long src1p[nr_cpumask_longs] = {pattern1};
unsigned long src2p[nr_cpumask_longs] = {pattern2};
for (/*a bunch of repetitions*/) {
for (int n = -1; n <= nr_cpu_ids; ++n) {
asm volatile("" : "+rm"(src1p)); // prevent any optimization
asm volatile("" : "+rm"(src2p));
unsigned long result = cpumask_next_and(n, src1p, src2p);
asm volatile("" : "+rm"(result));
}
}
```
Results:
pattern1 pattern2 time_before/time_after
0x0000ffff 0x0000ffff 1.65
0x0000ffff 0x00005555 2.24
0x0000ffff 0x00001111 2.94
0x0000ffff 0x00000000 14.0
0x00005555 0x0000ffff 1.67
0x00005555 0x00005555 1.71
0x00005555 0x00001111 1.90
0x00005555 0x00000000 6.58
0x00001111 0x0000ffff 1.46
0x00001111 0x00005555 1.49
0x00001111 0x00001111 1.45
0x00001111 0x00000000 3.10
0x00000000 0x0000ffff 1.18
0x00000000 0x00005555 1.18
0x00000000 0x00001111 1.17
0x00000000 0x00000000 1.25
-----------------------------
geo.mean 2.06
[2] test_find_next_bit, X86 (skylake)
[ 3913.477422] Start testing find_bit() with random-filled bitmap
[ 3913.477847] find_next_bit: 160868 cycles, 16484 iterations
[ 3913.477933] find_next_zero_bit: 169542 cycles, 16285 iterations
[ 3913.478036] find_last_bit: 201638 cycles, 16483 iterations
[ 3913.480214] find_first_bit: 4353244 cycles, 16484 iterations
[ 3913.480216] Start testing find_next_and_bit() with random-filled
bitmap
[ 3913.481074] find_next_and_bit: 89604 cycles, 8216 iterations
[ 3913.481075] Start testing find_bit() with sparse bitmap
[ 3913.481078] find_next_bit: 2536 cycles, 66 iterations
[ 3913.481252] find_next_zero_bit: 344404 cycles, 32703 iterations
[ 3913.481255] find_last_bit: 2006 cycles, 66 iterations
[ 3913.481265] find_first_bit: 17488 cycles, 66 iterations
[ 3913.481266] Start testing find_next_and_bit() with sparse bitmap
[ 3913.481272] find_next_and_bit: 764 cycles, 1 iterations
[3] test_find_next_bit, arm (v7 odroid XU3).
[ 267.206928] Start testing find_bit() with random-filled bitmap
[ 267.214752] find_next_bit: 4474 cycles, 16419 iterations
[ 267.221850] find_next_zero_bit: 5976 cycles, 16350 iterations
[ 267.229294] find_last_bit: 4209 cycles, 16419 iterations
[ 267.279131] find_first_bit: 1032991 cycles, 16420 iterations
[ 267.286265] Start testing find_next_and_bit() with random-filled
bitmap
[ 267.302386] find_next_and_bit: 2290 cycles, 8140 iterations
[ 267.309422] Start testing find_bit() with sparse bitmap
[ 267.316054] find_next_bit: 191 cycles, 66 iterations
[ 267.322726] find_next_zero_bit: 8758 cycles, 32703 iterations
[ 267.329803] find_last_bit: 84 cycles, 66 iterations
[ 267.336169] find_first_bit: 4118 cycles, 66 iterations
[ 267.342627] Start testing find_next_and_bit() with sparse bitmap
[ 267.356919] find_next_and_bit: 91 cycles, 1 iterations
[courbet@google.com: v6]
Link: http://lkml.kernel.org/r/20171129095715.23430-1-courbet@google.com
[geert@linux-m68k.org: m68k/bitops: always include <asm-generic/bitops/find.h>]
Link: http://lkml.kernel.org/r/1512556816-28627-1-git-send-email-geert@linux-m68k.org
Link: http://lkml.kernel.org/r/20171128131334.23491-1-courbet@google.com
Signed-off-by: Clement Courbet <courbet@google.com>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Yury Norov <ynorov@caviumnetworks.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-07 07:38:34 +08:00
|
|
|
* This is a common helper function for find_next_bit, find_next_zero_bit, and
|
|
|
|
* find_next_and_bit. The differences are:
|
|
|
|
* - The "invert" argument, which is XORed with each fetched word before
|
|
|
|
* searching it for one bits.
|
|
|
|
* - The optional "addr2", which is anded with "addr1" if present.
|
2006-03-26 17:39:11 +08:00
|
|
|
*/
|
2021-05-07 09:03:03 +08:00
|
|
|
unsigned long _find_next_bit(const unsigned long *addr1,
|
lib: optimize cpumask_next_and()
We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
It's essentially a joined iteration in search for a non-zero bit, which is
currently implemented as a lookup join (find a nonzero bit on the lhs,
lookup the rhs to see if it's set there).
Implement a direct join (find a nonzero bit on the incrementally built
join). Also add generic bitmap benchmarks in the new `test_find_bit`
module for new function (see `find_next_and_bit` in [2] and [3] below).
For cpumask_next_and, direct benchmarking shows that it's 1.17x to 14x
faster with a geometric mean of 2.1 on 32 CPUs [1]. No impact on memory
usage. Note that on Arm, the new pure-C implementation still outperforms
the old one that uses a mix of C and asm (`find_next_bit`) [3].
[1] Approximate benchmark code:
```
unsigned long src1p[nr_cpumask_longs] = {pattern1};
unsigned long src2p[nr_cpumask_longs] = {pattern2};
for (/*a bunch of repetitions*/) {
for (int n = -1; n <= nr_cpu_ids; ++n) {
asm volatile("" : "+rm"(src1p)); // prevent any optimization
asm volatile("" : "+rm"(src2p));
unsigned long result = cpumask_next_and(n, src1p, src2p);
asm volatile("" : "+rm"(result));
}
}
```
Results:
pattern1 pattern2 time_before/time_after
0x0000ffff 0x0000ffff 1.65
0x0000ffff 0x00005555 2.24
0x0000ffff 0x00001111 2.94
0x0000ffff 0x00000000 14.0
0x00005555 0x0000ffff 1.67
0x00005555 0x00005555 1.71
0x00005555 0x00001111 1.90
0x00005555 0x00000000 6.58
0x00001111 0x0000ffff 1.46
0x00001111 0x00005555 1.49
0x00001111 0x00001111 1.45
0x00001111 0x00000000 3.10
0x00000000 0x0000ffff 1.18
0x00000000 0x00005555 1.18
0x00000000 0x00001111 1.17
0x00000000 0x00000000 1.25
-----------------------------
geo.mean 2.06
[2] test_find_next_bit, X86 (skylake)
[ 3913.477422] Start testing find_bit() with random-filled bitmap
[ 3913.477847] find_next_bit: 160868 cycles, 16484 iterations
[ 3913.477933] find_next_zero_bit: 169542 cycles, 16285 iterations
[ 3913.478036] find_last_bit: 201638 cycles, 16483 iterations
[ 3913.480214] find_first_bit: 4353244 cycles, 16484 iterations
[ 3913.480216] Start testing find_next_and_bit() with random-filled
bitmap
[ 3913.481074] find_next_and_bit: 89604 cycles, 8216 iterations
[ 3913.481075] Start testing find_bit() with sparse bitmap
[ 3913.481078] find_next_bit: 2536 cycles, 66 iterations
[ 3913.481252] find_next_zero_bit: 344404 cycles, 32703 iterations
[ 3913.481255] find_last_bit: 2006 cycles, 66 iterations
[ 3913.481265] find_first_bit: 17488 cycles, 66 iterations
[ 3913.481266] Start testing find_next_and_bit() with sparse bitmap
[ 3913.481272] find_next_and_bit: 764 cycles, 1 iterations
[3] test_find_next_bit, arm (v7 odroid XU3).
[ 267.206928] Start testing find_bit() with random-filled bitmap
[ 267.214752] find_next_bit: 4474 cycles, 16419 iterations
[ 267.221850] find_next_zero_bit: 5976 cycles, 16350 iterations
[ 267.229294] find_last_bit: 4209 cycles, 16419 iterations
[ 267.279131] find_first_bit: 1032991 cycles, 16420 iterations
[ 267.286265] Start testing find_next_and_bit() with random-filled
bitmap
[ 267.302386] find_next_and_bit: 2290 cycles, 8140 iterations
[ 267.309422] Start testing find_bit() with sparse bitmap
[ 267.316054] find_next_bit: 191 cycles, 66 iterations
[ 267.322726] find_next_zero_bit: 8758 cycles, 32703 iterations
[ 267.329803] find_last_bit: 84 cycles, 66 iterations
[ 267.336169] find_first_bit: 4118 cycles, 66 iterations
[ 267.342627] Start testing find_next_and_bit() with sparse bitmap
[ 267.356919] find_next_and_bit: 91 cycles, 1 iterations
[courbet@google.com: v6]
Link: http://lkml.kernel.org/r/20171129095715.23430-1-courbet@google.com
[geert@linux-m68k.org: m68k/bitops: always include <asm-generic/bitops/find.h>]
Link: http://lkml.kernel.org/r/1512556816-28627-1-git-send-email-geert@linux-m68k.org
Link: http://lkml.kernel.org/r/20171128131334.23491-1-courbet@google.com
Signed-off-by: Clement Courbet <courbet@google.com>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Yury Norov <ynorov@caviumnetworks.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-07 07:38:34 +08:00
|
|
|
const unsigned long *addr2, unsigned long nbits,
|
2020-01-31 14:16:43 +08:00
|
|
|
unsigned long start, unsigned long invert, unsigned long le)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2020-01-31 14:16:43 +08:00
|
|
|
unsigned long tmp, mask;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-02-25 07:00:58 +08:00
|
|
|
if (unlikely(start >= nbits))
|
2015-04-17 03:43:13 +08:00
|
|
|
return nbits;
|
|
|
|
|
lib: optimize cpumask_next_and()
We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
It's essentially a joined iteration in search for a non-zero bit, which is
currently implemented as a lookup join (find a nonzero bit on the lhs,
lookup the rhs to see if it's set there).
Implement a direct join (find a nonzero bit on the incrementally built
join). Also add generic bitmap benchmarks in the new `test_find_bit`
module for new function (see `find_next_and_bit` in [2] and [3] below).
For cpumask_next_and, direct benchmarking shows that it's 1.17x to 14x
faster with a geometric mean of 2.1 on 32 CPUs [1]. No impact on memory
usage. Note that on Arm, the new pure-C implementation still outperforms
the old one that uses a mix of C and asm (`find_next_bit`) [3].
[1] Approximate benchmark code:
```
unsigned long src1p[nr_cpumask_longs] = {pattern1};
unsigned long src2p[nr_cpumask_longs] = {pattern2};
for (/*a bunch of repetitions*/) {
for (int n = -1; n <= nr_cpu_ids; ++n) {
asm volatile("" : "+rm"(src1p)); // prevent any optimization
asm volatile("" : "+rm"(src2p));
unsigned long result = cpumask_next_and(n, src1p, src2p);
asm volatile("" : "+rm"(result));
}
}
```
Results:
pattern1 pattern2 time_before/time_after
0x0000ffff 0x0000ffff 1.65
0x0000ffff 0x00005555 2.24
0x0000ffff 0x00001111 2.94
0x0000ffff 0x00000000 14.0
0x00005555 0x0000ffff 1.67
0x00005555 0x00005555 1.71
0x00005555 0x00001111 1.90
0x00005555 0x00000000 6.58
0x00001111 0x0000ffff 1.46
0x00001111 0x00005555 1.49
0x00001111 0x00001111 1.45
0x00001111 0x00000000 3.10
0x00000000 0x0000ffff 1.18
0x00000000 0x00005555 1.18
0x00000000 0x00001111 1.17
0x00000000 0x00000000 1.25
-----------------------------
geo.mean 2.06
[2] test_find_next_bit, X86 (skylake)
[ 3913.477422] Start testing find_bit() with random-filled bitmap
[ 3913.477847] find_next_bit: 160868 cycles, 16484 iterations
[ 3913.477933] find_next_zero_bit: 169542 cycles, 16285 iterations
[ 3913.478036] find_last_bit: 201638 cycles, 16483 iterations
[ 3913.480214] find_first_bit: 4353244 cycles, 16484 iterations
[ 3913.480216] Start testing find_next_and_bit() with random-filled
bitmap
[ 3913.481074] find_next_and_bit: 89604 cycles, 8216 iterations
[ 3913.481075] Start testing find_bit() with sparse bitmap
[ 3913.481078] find_next_bit: 2536 cycles, 66 iterations
[ 3913.481252] find_next_zero_bit: 344404 cycles, 32703 iterations
[ 3913.481255] find_last_bit: 2006 cycles, 66 iterations
[ 3913.481265] find_first_bit: 17488 cycles, 66 iterations
[ 3913.481266] Start testing find_next_and_bit() with sparse bitmap
[ 3913.481272] find_next_and_bit: 764 cycles, 1 iterations
[3] test_find_next_bit, arm (v7 odroid XU3).
[ 267.206928] Start testing find_bit() with random-filled bitmap
[ 267.214752] find_next_bit: 4474 cycles, 16419 iterations
[ 267.221850] find_next_zero_bit: 5976 cycles, 16350 iterations
[ 267.229294] find_last_bit: 4209 cycles, 16419 iterations
[ 267.279131] find_first_bit: 1032991 cycles, 16420 iterations
[ 267.286265] Start testing find_next_and_bit() with random-filled
bitmap
[ 267.302386] find_next_and_bit: 2290 cycles, 8140 iterations
[ 267.309422] Start testing find_bit() with sparse bitmap
[ 267.316054] find_next_bit: 191 cycles, 66 iterations
[ 267.322726] find_next_zero_bit: 8758 cycles, 32703 iterations
[ 267.329803] find_last_bit: 84 cycles, 66 iterations
[ 267.336169] find_first_bit: 4118 cycles, 66 iterations
[ 267.342627] Start testing find_next_and_bit() with sparse bitmap
[ 267.356919] find_next_and_bit: 91 cycles, 1 iterations
[courbet@google.com: v6]
Link: http://lkml.kernel.org/r/20171129095715.23430-1-courbet@google.com
[geert@linux-m68k.org: m68k/bitops: always include <asm-generic/bitops/find.h>]
Link: http://lkml.kernel.org/r/1512556816-28627-1-git-send-email-geert@linux-m68k.org
Link: http://lkml.kernel.org/r/20171128131334.23491-1-courbet@google.com
Signed-off-by: Clement Courbet <courbet@google.com>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Yury Norov <ynorov@caviumnetworks.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-07 07:38:34 +08:00
|
|
|
tmp = addr1[start / BITS_PER_LONG];
|
|
|
|
if (addr2)
|
|
|
|
tmp &= addr2[start / BITS_PER_LONG];
|
|
|
|
tmp ^= invert;
|
2015-04-17 03:43:13 +08:00
|
|
|
|
|
|
|
/* Handle 1st word. */
|
2020-01-31 14:16:43 +08:00
|
|
|
mask = BITMAP_FIRST_WORD_MASK(start);
|
|
|
|
if (le)
|
|
|
|
mask = swab(mask);
|
|
|
|
|
|
|
|
tmp &= mask;
|
|
|
|
|
2015-04-17 03:43:13 +08:00
|
|
|
start = round_down(start, BITS_PER_LONG);
|
|
|
|
|
|
|
|
while (!tmp) {
|
|
|
|
start += BITS_PER_LONG;
|
|
|
|
if (start >= nbits)
|
|
|
|
return nbits;
|
|
|
|
|
lib: optimize cpumask_next_and()
We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
It's essentially a joined iteration in search for a non-zero bit, which is
currently implemented as a lookup join (find a nonzero bit on the lhs,
lookup the rhs to see if it's set there).
Implement a direct join (find a nonzero bit on the incrementally built
join). Also add generic bitmap benchmarks in the new `test_find_bit`
module for new function (see `find_next_and_bit` in [2] and [3] below).
For cpumask_next_and, direct benchmarking shows that it's 1.17x to 14x
faster with a geometric mean of 2.1 on 32 CPUs [1]. No impact on memory
usage. Note that on Arm, the new pure-C implementation still outperforms
the old one that uses a mix of C and asm (`find_next_bit`) [3].
[1] Approximate benchmark code:
```
unsigned long src1p[nr_cpumask_longs] = {pattern1};
unsigned long src2p[nr_cpumask_longs] = {pattern2};
for (/*a bunch of repetitions*/) {
for (int n = -1; n <= nr_cpu_ids; ++n) {
asm volatile("" : "+rm"(src1p)); // prevent any optimization
asm volatile("" : "+rm"(src2p));
unsigned long result = cpumask_next_and(n, src1p, src2p);
asm volatile("" : "+rm"(result));
}
}
```
Results:
pattern1 pattern2 time_before/time_after
0x0000ffff 0x0000ffff 1.65
0x0000ffff 0x00005555 2.24
0x0000ffff 0x00001111 2.94
0x0000ffff 0x00000000 14.0
0x00005555 0x0000ffff 1.67
0x00005555 0x00005555 1.71
0x00005555 0x00001111 1.90
0x00005555 0x00000000 6.58
0x00001111 0x0000ffff 1.46
0x00001111 0x00005555 1.49
0x00001111 0x00001111 1.45
0x00001111 0x00000000 3.10
0x00000000 0x0000ffff 1.18
0x00000000 0x00005555 1.18
0x00000000 0x00001111 1.17
0x00000000 0x00000000 1.25
-----------------------------
geo.mean 2.06
[2] test_find_next_bit, X86 (skylake)
[ 3913.477422] Start testing find_bit() with random-filled bitmap
[ 3913.477847] find_next_bit: 160868 cycles, 16484 iterations
[ 3913.477933] find_next_zero_bit: 169542 cycles, 16285 iterations
[ 3913.478036] find_last_bit: 201638 cycles, 16483 iterations
[ 3913.480214] find_first_bit: 4353244 cycles, 16484 iterations
[ 3913.480216] Start testing find_next_and_bit() with random-filled
bitmap
[ 3913.481074] find_next_and_bit: 89604 cycles, 8216 iterations
[ 3913.481075] Start testing find_bit() with sparse bitmap
[ 3913.481078] find_next_bit: 2536 cycles, 66 iterations
[ 3913.481252] find_next_zero_bit: 344404 cycles, 32703 iterations
[ 3913.481255] find_last_bit: 2006 cycles, 66 iterations
[ 3913.481265] find_first_bit: 17488 cycles, 66 iterations
[ 3913.481266] Start testing find_next_and_bit() with sparse bitmap
[ 3913.481272] find_next_and_bit: 764 cycles, 1 iterations
[3] test_find_next_bit, arm (v7 odroid XU3).
[ 267.206928] Start testing find_bit() with random-filled bitmap
[ 267.214752] find_next_bit: 4474 cycles, 16419 iterations
[ 267.221850] find_next_zero_bit: 5976 cycles, 16350 iterations
[ 267.229294] find_last_bit: 4209 cycles, 16419 iterations
[ 267.279131] find_first_bit: 1032991 cycles, 16420 iterations
[ 267.286265] Start testing find_next_and_bit() with random-filled
bitmap
[ 267.302386] find_next_and_bit: 2290 cycles, 8140 iterations
[ 267.309422] Start testing find_bit() with sparse bitmap
[ 267.316054] find_next_bit: 191 cycles, 66 iterations
[ 267.322726] find_next_zero_bit: 8758 cycles, 32703 iterations
[ 267.329803] find_last_bit: 84 cycles, 66 iterations
[ 267.336169] find_first_bit: 4118 cycles, 66 iterations
[ 267.342627] Start testing find_next_and_bit() with sparse bitmap
[ 267.356919] find_next_and_bit: 91 cycles, 1 iterations
[courbet@google.com: v6]
Link: http://lkml.kernel.org/r/20171129095715.23430-1-courbet@google.com
[geert@linux-m68k.org: m68k/bitops: always include <asm-generic/bitops/find.h>]
Link: http://lkml.kernel.org/r/1512556816-28627-1-git-send-email-geert@linux-m68k.org
Link: http://lkml.kernel.org/r/20171128131334.23491-1-courbet@google.com
Signed-off-by: Clement Courbet <courbet@google.com>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Yury Norov <ynorov@caviumnetworks.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-07 07:38:34 +08:00
|
|
|
tmp = addr1[start / BITS_PER_LONG];
|
|
|
|
if (addr2)
|
|
|
|
tmp &= addr2[start / BITS_PER_LONG];
|
|
|
|
tmp ^= invert;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2020-01-31 14:16:43 +08:00
|
|
|
if (le)
|
|
|
|
tmp = swab(tmp);
|
|
|
|
|
2015-04-17 03:43:13 +08:00
|
|
|
return min(start + __ffs(tmp), nbits);
|
2006-03-26 17:39:11 +08:00
|
|
|
}
|
2021-05-07 09:03:03 +08:00
|
|
|
EXPORT_SYMBOL(_find_next_bit);
|
lib: optimize cpumask_next_and()
We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
It's essentially a joined iteration in search for a non-zero bit, which is
currently implemented as a lookup join (find a nonzero bit on the lhs,
lookup the rhs to see if it's set there).
Implement a direct join (find a nonzero bit on the incrementally built
join). Also add generic bitmap benchmarks in the new `test_find_bit`
module for new function (see `find_next_and_bit` in [2] and [3] below).
For cpumask_next_and, direct benchmarking shows that it's 1.17x to 14x
faster with a geometric mean of 2.1 on 32 CPUs [1]. No impact on memory
usage. Note that on Arm, the new pure-C implementation still outperforms
the old one that uses a mix of C and asm (`find_next_bit`) [3].
[1] Approximate benchmark code:
```
unsigned long src1p[nr_cpumask_longs] = {pattern1};
unsigned long src2p[nr_cpumask_longs] = {pattern2};
for (/*a bunch of repetitions*/) {
for (int n = -1; n <= nr_cpu_ids; ++n) {
asm volatile("" : "+rm"(src1p)); // prevent any optimization
asm volatile("" : "+rm"(src2p));
unsigned long result = cpumask_next_and(n, src1p, src2p);
asm volatile("" : "+rm"(result));
}
}
```
Results:
pattern1 pattern2 time_before/time_after
0x0000ffff 0x0000ffff 1.65
0x0000ffff 0x00005555 2.24
0x0000ffff 0x00001111 2.94
0x0000ffff 0x00000000 14.0
0x00005555 0x0000ffff 1.67
0x00005555 0x00005555 1.71
0x00005555 0x00001111 1.90
0x00005555 0x00000000 6.58
0x00001111 0x0000ffff 1.46
0x00001111 0x00005555 1.49
0x00001111 0x00001111 1.45
0x00001111 0x00000000 3.10
0x00000000 0x0000ffff 1.18
0x00000000 0x00005555 1.18
0x00000000 0x00001111 1.17
0x00000000 0x00000000 1.25
-----------------------------
geo.mean 2.06
[2] test_find_next_bit, X86 (skylake)
[ 3913.477422] Start testing find_bit() with random-filled bitmap
[ 3913.477847] find_next_bit: 160868 cycles, 16484 iterations
[ 3913.477933] find_next_zero_bit: 169542 cycles, 16285 iterations
[ 3913.478036] find_last_bit: 201638 cycles, 16483 iterations
[ 3913.480214] find_first_bit: 4353244 cycles, 16484 iterations
[ 3913.480216] Start testing find_next_and_bit() with random-filled
bitmap
[ 3913.481074] find_next_and_bit: 89604 cycles, 8216 iterations
[ 3913.481075] Start testing find_bit() with sparse bitmap
[ 3913.481078] find_next_bit: 2536 cycles, 66 iterations
[ 3913.481252] find_next_zero_bit: 344404 cycles, 32703 iterations
[ 3913.481255] find_last_bit: 2006 cycles, 66 iterations
[ 3913.481265] find_first_bit: 17488 cycles, 66 iterations
[ 3913.481266] Start testing find_next_and_bit() with sparse bitmap
[ 3913.481272] find_next_and_bit: 764 cycles, 1 iterations
[3] test_find_next_bit, arm (v7 odroid XU3).
[ 267.206928] Start testing find_bit() with random-filled bitmap
[ 267.214752] find_next_bit: 4474 cycles, 16419 iterations
[ 267.221850] find_next_zero_bit: 5976 cycles, 16350 iterations
[ 267.229294] find_last_bit: 4209 cycles, 16419 iterations
[ 267.279131] find_first_bit: 1032991 cycles, 16420 iterations
[ 267.286265] Start testing find_next_and_bit() with random-filled
bitmap
[ 267.302386] find_next_and_bit: 2290 cycles, 8140 iterations
[ 267.309422] Start testing find_bit() with sparse bitmap
[ 267.316054] find_next_bit: 191 cycles, 66 iterations
[ 267.322726] find_next_zero_bit: 8758 cycles, 32703 iterations
[ 267.329803] find_last_bit: 84 cycles, 66 iterations
[ 267.336169] find_first_bit: 4118 cycles, 66 iterations
[ 267.342627] Start testing find_next_and_bit() with sparse bitmap
[ 267.356919] find_next_and_bit: 91 cycles, 1 iterations
[courbet@google.com: v6]
Link: http://lkml.kernel.org/r/20171129095715.23430-1-courbet@google.com
[geert@linux-m68k.org: m68k/bitops: always include <asm-generic/bitops/find.h>]
Link: http://lkml.kernel.org/r/1512556816-28627-1-git-send-email-geert@linux-m68k.org
Link: http://lkml.kernel.org/r/20171128131334.23491-1-courbet@google.com
Signed-off-by: Clement Courbet <courbet@google.com>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Yury Norov <ynorov@caviumnetworks.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-07 07:38:34 +08:00
|
|
|
#endif
|
|
|
|
|
2011-05-27 07:26:09 +08:00
|
|
|
#ifndef find_first_bit
|
2008-04-01 17:46:19 +08:00
|
|
|
/*
|
|
|
|
* Find the first set bit in a memory region.
|
|
|
|
*/
|
2021-05-07 09:03:14 +08:00
|
|
|
unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
|
2008-04-01 17:46:19 +08:00
|
|
|
{
|
lib/find_bit: introduce FIND_FIRST_BIT() macro
Now that we have many flavors of find_first_bit(), and expect even more,
it's better to have one macro that generates optimal code for all and makes
maintaining of slightly different functions simpler.
The logic common to all versions is moved to the new macro, and all the
flavors are generated by providing an FETCH macro-parameter, like
in this example:
#define FIND_FIRST_BIT(FETCH, MUNGE, size) ...
find_first_ornot_and_bit(addr1, addr2, addr3, size)
{
return FIND_FIRST_BIT(addr1[idx] | ~addr2[idx] & addr3[idx], /* nop */, size);
}
The FETCH may be of any complexity, as soon as it only refers
the bitmap(s) and an iterator idx.
MUNGE is here to support _le code generation for BE builds. May be
empty.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
2022-09-15 10:07:27 +08:00
|
|
|
return FIND_FIRST_BIT(addr[idx], /* nop */, size);
|
2008-04-01 17:46:19 +08:00
|
|
|
}
|
2021-05-07 09:03:14 +08:00
|
|
|
EXPORT_SYMBOL(_find_first_bit);
|
2011-05-27 07:26:09 +08:00
|
|
|
#endif
|
2008-04-01 17:46:19 +08:00
|
|
|
|
2021-08-15 05:17:01 +08:00
|
|
|
#ifndef find_first_and_bit
|
|
|
|
/*
|
|
|
|
* Find the first set bit in two memory regions.
|
|
|
|
*/
|
|
|
|
unsigned long _find_first_and_bit(const unsigned long *addr1,
|
|
|
|
const unsigned long *addr2,
|
|
|
|
unsigned long size)
|
|
|
|
{
|
lib/find_bit: introduce FIND_FIRST_BIT() macro
Now that we have many flavors of find_first_bit(), and expect even more,
it's better to have one macro that generates optimal code for all and makes
maintaining of slightly different functions simpler.
The logic common to all versions is moved to the new macro, and all the
flavors are generated by providing an FETCH macro-parameter, like
in this example:
#define FIND_FIRST_BIT(FETCH, MUNGE, size) ...
find_first_ornot_and_bit(addr1, addr2, addr3, size)
{
return FIND_FIRST_BIT(addr1[idx] | ~addr2[idx] & addr3[idx], /* nop */, size);
}
The FETCH may be of any complexity, as soon as it only refers
the bitmap(s) and an iterator idx.
MUNGE is here to support _le code generation for BE builds. May be
empty.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
2022-09-15 10:07:27 +08:00
|
|
|
return FIND_FIRST_BIT(addr1[idx] & addr2[idx], /* nop */, size);
|
2021-08-15 05:17:01 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(_find_first_and_bit);
|
|
|
|
#endif
|
|
|
|
|
2011-05-27 07:26:09 +08:00
|
|
|
#ifndef find_first_zero_bit
|
2008-04-01 17:46:19 +08:00
|
|
|
/*
|
|
|
|
* Find the first cleared bit in a memory region.
|
|
|
|
*/
|
2021-05-07 09:03:14 +08:00
|
|
|
unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
|
2008-04-01 17:46:19 +08:00
|
|
|
{
|
lib/find_bit: introduce FIND_FIRST_BIT() macro
Now that we have many flavors of find_first_bit(), and expect even more,
it's better to have one macro that generates optimal code for all and makes
maintaining of slightly different functions simpler.
The logic common to all versions is moved to the new macro, and all the
flavors are generated by providing an FETCH macro-parameter, like
in this example:
#define FIND_FIRST_BIT(FETCH, MUNGE, size) ...
find_first_ornot_and_bit(addr1, addr2, addr3, size)
{
return FIND_FIRST_BIT(addr1[idx] | ~addr2[idx] & addr3[idx], /* nop */, size);
}
The FETCH may be of any complexity, as soon as it only refers
the bitmap(s) and an iterator idx.
MUNGE is here to support _le code generation for BE builds. May be
empty.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
2022-09-15 10:07:27 +08:00
|
|
|
return FIND_FIRST_BIT(~addr[idx], /* nop */, size);
|
2008-04-01 17:46:19 +08:00
|
|
|
}
|
2021-05-07 09:03:14 +08:00
|
|
|
EXPORT_SYMBOL(_find_first_zero_bit);
|
2011-05-27 07:26:09 +08:00
|
|
|
#endif
|
2006-03-26 17:39:15 +08:00
|
|
|
|
2015-04-17 03:43:16 +08:00
|
|
|
#ifndef find_last_bit
|
2021-05-07 09:03:14 +08:00
|
|
|
unsigned long _find_last_bit(const unsigned long *addr, unsigned long size)
|
2015-04-17 03:43:16 +08:00
|
|
|
{
|
|
|
|
if (size) {
|
|
|
|
unsigned long val = BITMAP_LAST_WORD_MASK(size);
|
|
|
|
unsigned long idx = (size-1) / BITS_PER_LONG;
|
|
|
|
|
|
|
|
do {
|
|
|
|
val &= addr[idx];
|
|
|
|
if (val)
|
|
|
|
return idx * BITS_PER_LONG + __fls(val);
|
|
|
|
|
|
|
|
val = ~0ul;
|
|
|
|
} while (idx--);
|
|
|
|
}
|
|
|
|
return size;
|
|
|
|
}
|
2021-05-07 09:03:14 +08:00
|
|
|
EXPORT_SYMBOL(_find_last_bit);
|
2015-04-17 03:43:16 +08:00
|
|
|
#endif
|
|
|
|
|
bitops: introduce the for_each_set_clump8 macro
Pach series "Introduce the for_each_set_clump8 macro", v18.
While adding GPIO get_multiple/set_multiple callback support for various
drivers, I noticed a pattern of looping manifesting that would be useful
standardized as a macro.
This patchset introduces the for_each_set_clump8 macro and utilizes it
in several GPIO drivers. The for_each_set_clump macro8 facilitates a
for-loop syntax that iterates over a memory region entire groups of set
bits at a time.
For example, suppose you would like to iterate over a 32-bit integer 8
bits at a time, skipping over 8-bit groups with no set bit, where
XXXXXXXX represents the current 8-bit group:
Example: 10111110 00000000 11111111 00110011
First loop: 10111110 00000000 11111111 XXXXXXXX
Second loop: 10111110 00000000 XXXXXXXX 00110011
Third loop: XXXXXXXX 00000000 11111111 00110011
Each iteration of the loop returns the next 8-bit group that has at
least one set bit.
The for_each_set_clump8 macro has four parameters:
* start: set to the bit offset of the current clump
* clump: set to the current clump value
* bits: bitmap to search within
* size: bitmap size in number of bits
In this version of the patchset, the for_each_set_clump macro has been
reimplemented and simplified based on the suggestions provided by Rasmus
Villemoes and Andy Shevchenko in the version 4 submission.
In particular, the function of the for_each_set_clump macro has been
restricted to handle only 8-bit clumps; the drivers that use the
for_each_set_clump macro only handle 8-bit ports so a generic
for_each_set_clump implementation is not necessary. Thus, a solution
for large clumps (i.e. those larger than the width of a bitmap word)
can be postponed until a driver appears that actually requires such a
generic for_each_set_clump implementation.
For what it's worth, a semi-generic for_each_set_clump (i.e. for clumps
smaller than the width of a bitmap word) can be implemented by simply
replacing the hardcoded '8' and '0xFF' instances with respective
variables. I have not yet had a need for such an implementation, and
since it falls short of a true generic for_each_set_clump function, I
have decided to forgo such an implementation for now.
In addition, the bitmap_get_value8 and bitmap_set_value8 functions are
introduced to get and set 8-bit values respectively. Their use is based
on the behavior suggested in the patchset version 4 review.
This patch (of 14):
This macro iterates for each 8-bit group of bits (clump) with set bits,
within a bitmap memory region. For each iteration, "start" is set to
the bit offset of the found clump, while the respective clump value is
stored to the location pointed by "clump". Additionally, the
bitmap_get_value8 and bitmap_set_value8 functions are introduced to
respectively get and set an 8-bit value in a bitmap memory region.
[gustavo@embeddedor.com: fix potential sign-extension overflow]
Link: http://lkml.kernel.org/r/20191015184657.GA26541@embeddedor
[akpm@linux-foundation.org: s/ULL/UL/, per Joe]
[vilhelm.gray@gmail.com: add for_each_set_clump8 documentation]
Link: http://lkml.kernel.org/r/20191016161825.301082-1-vilhelm.gray@gmail.com
Link: http://lkml.kernel.org/r/893c3b4f03266c9496137cc98ac2b1bd27f92c73.1570641097.git.vilhelm.gray@gmail.com
Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Suggested-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Suggested-by: Lukas Wunner <lukas@wunner.de>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Phil Reid <preid@electromag.com.au>
Cc: Geert Uytterhoeven <geert+renesas@glider.be>
Cc: Mathias Duckeck <m.duckeck@kunbus.de>
Cc: Morten Hein Tiljeset <morten.tiljeset@prevas.dk>
Cc: Sean Nyekjaer <sean.nyekjaer@prevas.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-05 08:50:57 +08:00
|
|
|
unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
|
|
|
|
unsigned long size, unsigned long offset)
|
|
|
|
{
|
|
|
|
offset = find_next_bit(addr, size, offset);
|
|
|
|
if (offset == size)
|
|
|
|
return size;
|
|
|
|
|
|
|
|
offset = round_down(offset, 8);
|
|
|
|
*clump = bitmap_get_value8(addr, offset);
|
|
|
|
|
|
|
|
return offset;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(find_next_clump8);
|
2022-09-15 10:07:28 +08:00
|
|
|
|
|
|
|
#ifdef __BIG_ENDIAN
|
|
|
|
|
|
|
|
#ifndef find_first_zero_bit_le
|
|
|
|
/*
|
|
|
|
* Find the first cleared bit in an LE memory region.
|
|
|
|
*/
|
|
|
|
unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size)
|
|
|
|
{
|
|
|
|
return FIND_FIRST_BIT(~addr[idx], swab, size);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(_find_first_zero_bit_le);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* __BIG_ENDIAN */
|