Now that we have many flavors of find_first_bit(), and expect even more,
it's better to have one macro that generates optimal code for all and makes
maintaining of slightly different functions simpler.
The logic common to all versions is moved to the new macro, and all the
flavors are generated by providing an EXPRESSION macro-parameter, like
in this example:
#define FIND_FIRST_BIT(EXPRESSION, size) ...
find_first_ornot_and_bit(addr1, addr2, addr3, size)
{
return FIND_NEXT_BIT(addr1[idx] | ~addr2[idx] & addr3[idx], size);
}
The EXPRESSION may be of any complexity, as soon as it only refers
the bitmap(s) and an iterator idx.
The 'word_op' is here to allow the macro to generate code for _le
versions on big-endian machines; used in the following patches.
Suggested-by: Linus Torvalds <[email protected]>
Signed-off-by: Yury Norov <[email protected]>
---
MAINTAINERS | 1 +
lib/find_bit.c | 30 +++++-------------------------
lib/find_bit.h | 24 ++++++++++++++++++++++++
3 files changed, 30 insertions(+), 25 deletions(-)
create mode 100644 lib/find_bit.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 96f47a7865d6..02e11f2dbafe 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3612,6 +3612,7 @@ F: include/linux/find.h
F: include/linux/nodemask.h
F: lib/bitmap.c
F: lib/cpumask.c
+F: lib/find_bit.h
F: lib/find_bit.c
F: lib/find_bit_benchmark.c
F: lib/test_bitmap.c
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 1b8e4b2a9cba..ccc4fb1dfc71 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -19,6 +19,8 @@
#include <linux/minmax.h>
#include <linux/swab.h>
+#include "find_bit.h"
+
#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
!defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \
!defined(find_next_and_bit)
@@ -77,14 +79,7 @@ EXPORT_SYMBOL(_find_next_bit);
*/
unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
{
- unsigned long idx;
-
- for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
- if (addr[idx])
- return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
- }
-
- return size;
+ return FIND_FIRST_BIT(addr[idx], size);
}
EXPORT_SYMBOL(_find_first_bit);
#endif
@@ -97,15 +92,7 @@ unsigned long _find_first_and_bit(const unsigned long *addr1,
const unsigned long *addr2,
unsigned long size)
{
- unsigned long idx, val;
-
- for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
- val = addr1[idx] & addr2[idx];
- if (val)
- return min(idx * BITS_PER_LONG + __ffs(val), size);
- }
-
- return size;
+ return FIND_FIRST_BIT(addr1[idx] & addr2[idx], size);
}
EXPORT_SYMBOL(_find_first_and_bit);
#endif
@@ -116,14 +103,7 @@ EXPORT_SYMBOL(_find_first_and_bit);
*/
unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
- unsigned long idx;
-
- for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
- if (addr[idx] != ~0UL)
- return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
- }
-
- return size;
+ return FIND_FIRST_BIT(~addr[idx], size);
}
EXPORT_SYMBOL(_find_first_zero_bit);
#endif
diff --git a/lib/find_bit.h b/lib/find_bit.h
new file mode 100644
index 000000000000..b4b6245ddbf6
--- /dev/null
+++ b/lib/find_bit.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LIB_FIND_BIT_H
+#define _LIB_FIND_BIT_H
+
+#ifndef word_op
+#define word_op
+#endif
+
+#define FIND_FIRST_BIT(EXPRESSION, size) \
+({ \
+ unsigned long idx, val, sz = (size); \
+ \
+ for (idx = 0; idx * BITS_PER_LONG < sz; idx++) { \
+ val = (EXPRESSION); \
+ if (val) { \
+ sz = min(idx * BITS_PER_LONG + __ffs(word_op(val)), sz);\
+ break; \
+ } \
+ } \
+ \
+ sz; \
+})
+
+#endif /* _LIB_FIND_BIT_H */
--
2.34.1
On Wed, Aug 24, 2022 at 4:51 AM Yury Norov <[email protected]> wrote:
>
> Now that we have many flavors of find_first_bit(), and expect even more,
> it's better to have one macro that generates optimal code for all and makes
> maintaining of slightly different functions simpler.
>
> The logic common to all versions is moved to the new macro, and all the
> flavors are generated by providing an EXPRESSION macro-parameter, like
> in this example:
>
> #define FIND_FIRST_BIT(EXPRESSION, size) ...
>
> find_first_ornot_and_bit(addr1, addr2, addr3, size)
> {
> return FIND_NEXT_BIT(addr1[idx] | ~addr2[idx] & addr3[idx], size);
> }
>
> The EXPRESSION may be of any complexity, as soon as it only refers
> the bitmap(s) and an iterator idx.
>
> The 'word_op' is here to allow the macro to generate code for _le
> versions on big-endian machines; used in the following patches.
...
> +#ifndef word_op
> +#define word_op
> +#endif
Not sure about the naming without namespace. Perhaps __ffs_word_op?
> +#define FIND_FIRST_BIT(EXPRESSION, size) \
> +({ \
> + unsigned long idx, val, sz = (size); \
> + \
> + for (idx = 0; idx * BITS_PER_LONG < sz; idx++) { \
I think we can do slightly better:
for (unsigned long idx = 0; idx < sz; idx += BITS_PER_LONG) {
unsigned long val;
> + val = (EXPRESSION); \
> + if (val) { \
> + sz = min(idx * BITS_PER_LONG + __ffs(word_op(val)), sz);\
sz = min(idx + __ffs(...));
> + break; \
> + } \
> + } \
> + \
> + sz; \
> +})
--
With Best Regards,
Andy Shevchenko
On Wed, Aug 24, 2022 at 12:10:02PM +0300, Andy Shevchenko wrote:
> On Wed, Aug 24, 2022 at 4:51 AM Yury Norov <[email protected]> wrote:
> >
> > Now that we have many flavors of find_first_bit(), and expect even more,
> > it's better to have one macro that generates optimal code for all and makes
> > maintaining of slightly different functions simpler.
> >
> > The logic common to all versions is moved to the new macro, and all the
> > flavors are generated by providing an EXPRESSION macro-parameter, like
> > in this example:
> >
> > #define FIND_FIRST_BIT(EXPRESSION, size) ...
> >
> > find_first_ornot_and_bit(addr1, addr2, addr3, size)
> > {
> > return FIND_NEXT_BIT(addr1[idx] | ~addr2[idx] & addr3[idx], size);
> > }
> >
> > The EXPRESSION may be of any complexity, as soon as it only refers
> > the bitmap(s) and an iterator idx.
> >
> > The 'word_op' is here to allow the macro to generate code for _le
> > versions on big-endian machines; used in the following patches.
>
> ...
>
> > +#ifndef word_op
> > +#define word_op
> > +#endif
>
> Not sure about the naming without namespace. Perhaps __ffs_word_op?
>
> > +#define FIND_FIRST_BIT(EXPRESSION, size) \
> > +({ \
> > + unsigned long idx, val, sz = (size); \
> > + \
> > + for (idx = 0; idx * BITS_PER_LONG < sz; idx++) { \
>
> I think we can do slightly better:
>
> for (unsigned long idx = 0; idx < sz; idx += BITS_PER_LONG) {
> unsigned long val;
This will blow up the EXPRESSION. We can mitigate it on user side:
find_first_bit(addr, size)
{
return FIND_FIRST_BIT(addr[idx/BITS_PER_LONG], size);
}
But to me it's a wtf++.
And generated code looks almost the same, except that
on x86_64 your version is bigger. Compare before:
0000000000000000 <_find_first_bit>:
0: mov %rsi,%rax
3: test %rsi,%rsi
6: je 35 <_find_first_bit+0x35>
8: xor %edx,%edx
a: jmp 19 <_find_first_bit+0x19>
c: add $0x40,%rdx // Track bits and
10: add $0x8,%rdi // index separately
14: cmp %rax,%rdx
17: jae 35 <_find_first_bit+0x35>
19: mov (%rdi),%rcx
1c: test %rcx,%rcx
1f: je c <_find_first_bit+0xc>
21: tzcnt %rcx,%rcx
26: add %rdx,%rcx
29: cmp %rcx,%rax
2c: cmova %rcx,%rax
30: jmp 35 <_find_first_bit+0x35>
35: jmp 3a <_find_first_bit+0x3a>
3a: nopw 0x0(%rax,%rax,1)
And after:
0000000000000000 <_find_first_bit>:
0: mov %rsi,%rax
3: test %rsi,%rsi
6: je 39 <_find_first_bit+0x39>
8: xor %edx,%edx
a: jmp 15 <_find_first_bit+0x15>
c: add $0x40,%rdx // Track bits only
10: cmp %rdx,%rax
13: jbe 39 <_find_first_bit+0x39>
15: mov %rdx,%rcx
18: shr $0x6,%rcx // But divide here
1c: mov (%rdi,%rcx,8),%rcx
20: test %rcx,%rcx
23: je c <_find_first_bit+0xc>
25: tzcnt %rcx,%rcx
2a: add %rcx,%rdx
2d: cmp %rdx,%rax
30: cmova %rdx,%rax
34: jmp 39 <_find_first_bit+0x39>
39: jmp 3e <_find_first_bit+0x3e>
3e: xchg %ax,%ax // Which adds 4 bytes to .text
Thanks,
Yury
> > + val = (EXPRESSION); \
> > + if (val) { \
> > + sz = min(idx * BITS_PER_LONG + __ffs(word_op(val)), sz);\
>
> sz = min(idx + __ffs(...));
>
> > + break; \
> > + } \
> > + } \
> > + \
> > + sz; \
> > +})
>
>
> --
> With Best Regards,
> Andy Shevchenko
...
> And generated code looks almost the same, except that
> on x86_64 your version is bigger. Compare before:
> 0000000000000000 <_find_first_bit>:
> 0: mov %rsi,%rax
> 3: test %rsi,%rsi
> 6: je 35 <_find_first_bit+0x35>
> 8: xor %edx,%edx
> a: jmp 19 <_find_first_bit+0x19>
> c: add $0x40,%rdx // Track bits and
> 10: add $0x8,%rdi // index separately
That add is free - happens in parallel with other instrutcions
> 14: cmp %rax,%rdx
> 17: jae 35 <_find_first_bit+0x35>
The instructions below will (probably/hopefully) be
speculatively executed in parallel with the cmp/jae above
> 19: mov (%rdi),%rcx
> 1c: test %rcx,%rcx
> 1f: je c <_find_first_bit+0xc>
> 21: tzcnt %rcx,%rcx
> 26: add %rdx,%rcx
> 29: cmp %rcx,%rax
> 2c: cmova %rcx,%rax
> 30: jmp 35 <_find_first_bit+0x35>
> 35: jmp 3a <_find_first_bit+0x3a>
> 3a: nopw 0x0(%rax,%rax,1)
>
> And after:
> 0000000000000000 <_find_first_bit>:
> 0: mov %rsi,%rax
> 3: test %rsi,%rsi
> 6: je 39 <_find_first_bit+0x39>
> 8: xor %edx,%edx
> a: jmp 15 <_find_first_bit+0x15>
> c: add $0x40,%rdx // Track bits only
> 10: cmp %rdx,%rax
> 13: jbe 39 <_find_first_bit+0x39>
> 15: mov %rdx,%rcx
> 18: shr $0x6,%rcx // But divide here
> 1c: mov (%rdi,%rcx,8),%rcx
> 20: test %rcx,%rcx
That is a long register dependency chain involving %cx.
It will limit the execution speed to (at least 6) clocks/iteration.
The older version might be 3 clocks/iteration.
So this could easily run at half the speed.
David
> 23: je c <_find_first_bit+0xc>
> 25: tzcnt %rcx,%rcx
> 2a: add %rcx,%rdx
> 2d: cmp %rdx,%rax
> 30: cmova %rdx,%rax
> 34: jmp 39 <_find_first_bit+0x39>
> 39: jmp 3e <_find_first_bit+0x3e>
> 3e: xchg %ax,%ax // Which adds 4 bytes to .text
>
> Thanks,
> Yury
>
> > > + val = (EXPRESSION); \
> > > + if (val) { \
> > > + sz = min(idx * BITS_PER_LONG + __ffs(word_op(val)), sz);\
> >
> > sz = min(idx + __ffs(...));
> >
> > > + break; \
> > > + } \
> > > + } \
> > > + \
> > > + sz; \
> > > +})
> >
> >
> > --
> > With Best Regards,
> > Andy Shevchenko
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Wed, Aug 24, 2022 at 4:19 PM Yury Norov <[email protected]> wrote:
> On Wed, Aug 24, 2022 at 12:10:02PM +0300, Andy Shevchenko wrote:
> > On Wed, Aug 24, 2022 at 4:51 AM Yury Norov <[email protected]> wrote:
...
> > > +#define FIND_FIRST_BIT(EXPRESSION, size) \
> > > +({ \
> > > + unsigned long idx, val, sz = (size); \
> > > + \
> > > + for (idx = 0; idx * BITS_PER_LONG < sz; idx++) { \
> >
> > I think we can do slightly better:
> >
> > for (unsigned long idx = 0; idx < sz; idx += BITS_PER_LONG) {
> > unsigned long val;
>
> This will blow up the EXPRESSION. We can mitigate it on user side:
I'm not sure I understand how EXPRESSION is involved in all this. What
I proposed is to replace the for-loop one-by-one to
one-by-BITS_PER_LONG. But okay, I have re-read the above patch and now
I see what you are doing, basically you use internal variables of the
macro in the EXPRESSION. Hmm...
--
With Best Regards,
Andy Shevchenko
On Tue, Aug 23, 2022 at 6:26 PM Yury Norov <[email protected]> wrote:
>
> +
> +#ifndef word_op
> +#define word_op
> +#endif
> +
> +#define FIND_FIRST_BIT(EXPRESSION, size) \
Please just make 'word_op' be an macro argument, the same way 'EXPRESSION' is.
That way the LE/BE cases can be handled without any odd tricks.
Linus