Subject: [PATCH] x86: merge the simple bitops and move them to bitops.h

x86: merge the simple bitops and move them to bitops.h

Some of those can be written in such a way that the same
inline assembly can be used to generate both 32 bit and
64 bit code.

For ffs and fls, x86_64 unconditionally used the cmov
instruction and i386 unconditionally used a conditional
branch over a mov instruction. In the current patch I
chose to select the version based on the availability
of the cmov instruction instead. A small detail here is
that x86_64 did not previously set CONFIG_X86_CMOV=y.

Signed-off-by: Alexander van Heukelum <[email protected]>

---

arch/x86/Kconfig.cpu | 2 +-
include/asm-x86/bitops.h | 90 +++++++++++++++++++++++++++++++++++++++++++
include/asm-x86/bitops_32.h | 64 ------------------------------
include/asm-x86/bitops_64.h | 76 ------------------------------------
4 files changed, 91 insertions(+), 141 deletions(-)

Hi Ingo,

On a 64-bit machine the cmov instruction is always available. I made
that explicit by turning on CONFIG_X86_CMOV for all 64-bit builds.

This patch introduces a change in behaviour for 32-bit builds: if
the selected cpu has the cmov instruction, it will now be used by
the functions ffs and fls.

For the i386 defconfig the number of occurrences of cmov increases
from 4336 to 4429 and vmlinux text size increases 15 bytes. When
building for 486 no cmovs are generated, as expected.

Greetings,
Alexander

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 31e92fb..fb7399b 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -399,7 +399,7 @@ config X86_TSC
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
+ depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64)

config X86_MINIMUM_CPU_FAMILY
int
diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h
index 1a23ce1..bbdecee 100644
--- a/include/asm-x86/bitops.h
+++ b/include/asm-x86/bitops.h
@@ -310,6 +310,96 @@ static int test_bit(int nr, const volatile unsigned long *addr);
constant_test_bit((nr),(addr)) : \
variable_test_bit((nr),(addr)))

+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ __asm__("bsf %1,%0"
+ :"=r" (word)
+ :"rm" (word));
+ return word;
+}
+
+/**
+ * ffz - find first zero in word.
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static inline unsigned long ffz(unsigned long word)
+{
+ __asm__("bsf %1,%0"
+ :"=r" (word)
+ :"r" (~word));
+ return word;
+}
+
+/*
+ * __fls: find last bit set.
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static inline unsigned long __fls(unsigned long word)
+{
+ __asm__("bsr %1,%0"
+ :"=r" (word)
+ :"rm" (word));
+ return word;
+}
+
+#ifdef __KERNEL__
+/**
+ * ffs - find first bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as
+ * the libc and compiler builtin ffs routines, therefore
+ * differs in spirit from the above ffz() (man ffs).
+ */
+static inline int ffs(int x)
+{
+ int r;
+#ifdef CONFIG_X86_CMOV
+ __asm__("bsfl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=r" (r) : "rm" (x), "r" (-1));
+#else
+ __asm__("bsfl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r+1;
+}
+
+/**
+ * fls - find last bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as ffs().
+ */
+static inline int fls(int x)
+{
+ int r;
+#ifdef CONFIG_X86_CMOV
+ __asm__("bsrl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=&r" (r) : "rm" (x), "rm" (-1));
+#else
+ __asm__("bsrl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r+1;
+}
+#endif /* __KERNEL__ */
+
#undef ADDR

#ifdef CONFIG_X86_32
diff --git a/include/asm-x86/bitops_32.h b/include/asm-x86/bitops_32.h
index 570f0fa..c19fbe9 100644
--- a/include/asm-x86/bitops_32.h
+++ b/include/asm-x86/bitops_32.h
@@ -38,20 +38,6 @@ static inline int find_first_zero_bit(const unsigned long *addr, unsigned size)
}

/**
- * __ffs - find first bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- __asm__("bsfl %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
-/**
* find_first_bit - find the first set bit in a memory region
* @addr: The address to start the search at
* @size: The maximum size to search
@@ -72,60 +58,10 @@ static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
return x;
}

-/**
- * ffz - find first zero in word.
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-static inline unsigned long ffz(unsigned long word)
-{
- __asm__("bsfl %1,%0"
- :"=r" (word)
- :"r" (~word));
- return word;
-}
-
#ifdef __KERNEL__

#include <asm-generic/bitops/sched.h>

-/**
- * ffs - find first bit set
- * @x: the word to search
- *
- * This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz() (man ffs).
- */
-static inline int ffs(int x)
-{
- int r;
-
- __asm__("bsfl %1,%0\n\t"
- "jnz 1f\n\t"
- "movl $-1,%0\n"
- "1:" : "=r" (r) : "rm" (x));
- return r+1;
-}
-
-/**
- * fls - find last bit set
- * @x: the word to search
- *
- * This is defined the same way as ffs().
- */
-static inline int fls(int x)
-{
- int r;
-
- __asm__("bsrl %1,%0\n\t"
- "jnz 1f\n\t"
- "movl $-1,%0\n"
- "1:" : "=r" (r) : "rm" (x));
- return r+1;
-}
-
#include <asm-generic/bitops/hweight.h>

#endif /* __KERNEL__ */
diff --git a/include/asm-x86/bitops_64.h b/include/asm-x86/bitops_64.h
index 87e1a17..866ed56 100644
--- a/include/asm-x86/bitops_64.h
+++ b/include/asm-x86/bitops_64.h
@@ -35,71 +35,11 @@ static inline void set_bit_string(unsigned long *bitmap, unsigned long i,
}
}

-/**
- * ffz - find first zero in word.
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-static inline unsigned long ffz(unsigned long word)
-{
- __asm__("bsfq %1,%0"
- :"=r" (word)
- :"r" (~word));
- return word;
-}
-
-/**
- * __ffs - find first bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- __asm__("bsfq %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
-/*
- * __fls: find last bit set.
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-static inline unsigned long __fls(unsigned long word)
-{
- __asm__("bsrq %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
#ifdef __KERNEL__

#include <asm-generic/bitops/sched.h>

/**
- * ffs - find first bit set
- * @x: the word to search
- *
- * This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz (man ffs).
- */
-static inline int ffs(int x)
-{
- int r;
-
- __asm__("bsfl %1,%0\n\t"
- "cmovzl %2,%0"
- : "=r" (r) : "rm" (x), "r" (-1));
- return r+1;
-}
-
-/**
* fls64 - find last bit set in 64 bit word
* @x: the word to search
*
@@ -112,22 +52,6 @@ static inline int fls64(__u64 x)
return __fls(x) + 1;
}

-/**
- * fls - find last bit set
- * @x: the word to search
- *
- * This is defined the same way as ffs.
- */
-static inline int fls(int x)
-{
- int r;
-
- __asm__("bsrl %1,%0\n\t"
- "cmovzl %2,%0"
- : "=&r" (r) : "rm" (x), "rm" (-1));
- return r+1;
-}
-
#define ARCH_HAS_FAST_MULTIPLIER 1

#include <asm-generic/bitops/hweight.h>


2008-03-14 18:10:12

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH] x86: merge the simple bitops and move them to bitops.h

Alexander van Heukelum wrote:
> x86: merge the simple bitops and move them to bitops.h
>
> Some of those can be written in such a way that the same
> inline assembly can be used to generate both 32 bit and
> 64 bit code.
>
> For ffs and fls, x86_64 unconditionally used the cmov
> instruction and i386 unconditionally used a conditional
> branch over a mov instruction. In the current patch I
> chose to select the version based on the availability
> of the cmov instruction instead. A small detail here is
> that x86_64 did not previously set CONFIG_X86_CMOV=y.
>

Looks good in general. What's left in bitops_{32,64}.h now?

(Some comments below.)
> Signed-off-by: Alexander van Heukelum <[email protected]>
>
> ---
>
> arch/x86/Kconfig.cpu | 2 +-
> include/asm-x86/bitops.h | 90 +++++++++++++++++++++++++++++++++++++++++++
> include/asm-x86/bitops_32.h | 64 ------------------------------
> include/asm-x86/bitops_64.h | 76 ------------------------------------
> 4 files changed, 91 insertions(+), 141 deletions(-)
>
> Hi Ingo,
>
> On a 64-bit machine the cmov instruction is always available. I made
> that explicit by turning on CONFIG_X86_CMOV for all 64-bit builds.
>
> This patch introduces a change in behaviour for 32-bit builds: if
> the selected cpu has the cmov instruction, it will now be used by
> the functions ffs and fls.
>
> For the i386 defconfig the number of occurrences of cmov increases
> from 4336 to 4429 and vmlinux text size increases 15 bytes. When
> building for 486 no cmovs are generated, as expected.
>
> Greetings,
> Alexander
>
> diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
> index 31e92fb..fb7399b 100644
> --- a/arch/x86/Kconfig.cpu
> +++ b/arch/x86/Kconfig.cpu
> @@ -399,7 +399,7 @@ config X86_TSC
> # generates cmov.
> config X86_CMOV
> def_bool y
> - depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
> + depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64)
>
> config X86_MINIMUM_CPU_FAMILY
> int
> diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h
> index 1a23ce1..bbdecee 100644
> --- a/include/asm-x86/bitops.h
> +++ b/include/asm-x86/bitops.h
> @@ -310,6 +310,96 @@ static int test_bit(int nr, const volatile unsigned long *addr);
> constant_test_bit((nr),(addr)) : \
> variable_test_bit((nr),(addr)))
>
> +/**
> + * __ffs - find first bit in word.
> + * @word: The word to search
> + *
> + * Undefined if no bit exists, so code should check against 0 first.
> + */
> +static inline unsigned long __ffs(unsigned long word)
> +{
> + __asm__("bsf %1,%0"
> + :"=r" (word)
> + :"rm" (word));
> + return word;
> +}
> +
> +/**
> + * ffz - find first zero in word.
> + * @word: The word to search
> + *
> + * Undefined if no zero exists, so code should check against ~0UL first.
> + */
> +static inline unsigned long ffz(unsigned long word)
> +{
> + __asm__("bsf %1,%0"
> + :"=r" (word)
> + :"r" (~word));
> + return word;
> +}
> +
> +/*
> + * __fls: find last bit set.
> + * @word: The word to search
> + *
> + * Undefined if no zero exists, so code should check against ~0UL first.
> + */
> +static inline unsigned long __fls(unsigned long word)
> +{
> + __asm__("bsr %1,%0"
> + :"=r" (word)
> + :"rm" (word));
> + return word;
> +}
> +
> +#ifdef __KERNEL__
> +/**
> + * ffs - find first bit set
> + * @x: the word to search
> + *
> + * This is defined the same way as
> + * the libc and compiler builtin ffs routines, therefore
> + * differs in spirit from the above ffz() (man ffs).
>

This comment seems wrong. My "man ffs" says that it returns 1-32 for
non-zero inputs, and 0 for a zero input. This function returns 0-31, or
-1 for a zero input.

DESCRIPTION
The ffs() function returns the position of the first (least significant)
bit set in the word i. The least significant bit is position 1 and the
most significant position is, for example, 32 or 64. The functions
ffsll() and ffsl() do the same but take arguments of possibly different
size.

RETURN VALUE
These functions return the position of the first bit set, or 0 if no
bits are set in i.



> + */
> +static inline int ffs(int x)
> +{
> + int r;
> +#ifdef CONFIG_X86_CMOV
> + __asm__("bsfl %1,%0\n\t"
> + "cmovzl %2,%0"
>

The prevailing style in bitops.h is to use "bsf"/"cmovz" and let the
compiler work out the size from the register names.

> + : "=r" (r) : "rm" (x), "r" (-1));
> +#else
> + __asm__("bsfl %1,%0\n\t"
> + "jnz 1f\n\t"
> + "movl $-1,%0\n"
> + "1:" : "=r" (r) : "rm" (x));
> +#endif
> + return r+1;
> +}
> +
> +/**
> + * fls - find last bit set
> + * @x: the word to search
> + *
> + * This is defined the same way as ffs().
>

And this comment is even more wrong, given that ffs and fls are
completely different functions ;)

I know these are from the original, but its worth fixing given that
you're touching it anyway.

> + */
> +static inline int fls(int x)
> +{
> + int r;
> +#ifdef CONFIG_X86_CMOV
> + __asm__("bsrl %1,%0\n\t"
> + "cmovzl %2,%0"
> + : "=&r" (r) : "rm" (x), "rm" (-1));
> +#else
> + __asm__("bsrl %1,%0\n\t"
> + "jnz 1f\n\t"
> + "movl $-1,%0\n"
> + "1:" : "=r" (r) : "rm" (x));
> +#endif
> + return r+1;
> +}
> +#endif /* __KERNEL__ */
> +
> #undef ADDR
>
> #ifdef CONFIG_X86_32

J

2008-03-14 19:45:52

by Alexander van Heukelum

[permalink] [raw]
Subject: Re: [PATCH] x86: merge the simple bitops and move them to bitops.h

On Fri, 14 Mar 2008 11:07:55 -0700, "Jeremy Fitzhardinge"
<[email protected]> said:
> Alexander van Heukelum wrote:
> > x86: merge the simple bitops and move them to bitops.h
> >
> > Some of those can be written in such a way that the same
> > inline assembly can be used to generate both 32 bit and
> > 64 bit code.
> >
> > For ffs and fls, x86_64 unconditionally used the cmov
> > instruction and i386 unconditionally used a conditional
> > branch over a mov instruction. In the current patch I
> > chose to select the version based on the availability
> > of the cmov instruction instead. A small detail here is
> > that x86_64 did not previously set CONFIG_X86_CMOV=y.
> >
>
> Looks good in general. What's left in bitops_{32,64}.h now?

Thanks for taking a look!

bitops_{32,64}.h are getting pretty empty ;)

Both contain find_first_bit/find_first_zero_bit, i386 has them inlined,
x86_64 has an ugly define to select between small bitmaps (inlined) and
an out-of-line version. I think they should be unified much like how
find_next_bit and find_next_zero_bit work now (in x86#testing).

Both define fls64(), but i386 uses a generic one and x86_64 defines
one all by itself. The generic one is currently not suitable for
use by 64-bit archs... that can change.

x86_64 defines ARCH_HAS_FAST_MULTIPLIER, i386 not. This affects a
choice of generated code in the (generic) hweight function. It would
be nice if that could move to some other file.

x86_64 has a mysterious inline function set_bit_string, which is
only used by pci-calgary_64.c and pci-gart_64.c. Not sure what to
do with it.

> (Some comments below.)

--- %< ---

> > +#ifdef __KERNEL__
> > +/**
> > + * ffs - find first bit set
> > + * @x: the word to search
> > + *
> > + * This is defined the same way as
> > + * the libc and compiler builtin ffs routines, therefore
> > + * differs in spirit from the above ffz() (man ffs).
>
> This comment seems wrong. My "man ffs" says that it returns 1-32 for
> non-zero inputs, and 0 for a zero input. This function returns 0-31, or
> -1 for a zero input.

Seems, indeed. You missed the "return r + 1;" ;-)

> > + */
> > +static inline int ffs(int x)
> > +{
> > + int r;
> > +#ifdef CONFIG_X86_CMOV
> > + __asm__("bsfl %1,%0\n\t"
> > + "cmovzl %2,%0"
> >
>
> The prevailing style in bitops.h is to use "bsf"/"cmovz" and let the
> compiler work out the size from the register names.

The comment rightly mentions that ffs differs in spirit from the rest
of the functions here. The type to be used here it always a 4-byte one
(int); most other functions operate on longs. I have not checked if the
explicit size is absolutely necessary, but I feel much safer with the
explicit size there.

> > + : "=r" (r) : "rm" (x), "r" (-1));
> > +#else
> > + __asm__("bsfl %1,%0\n\t"
> > + "jnz 1f\n\t"
> > + "movl $-1,%0\n"
> > + "1:" : "=r" (r) : "rm" (x));
> > +#endif
> > + return r+1;
> > +}
> > +
> > +/**
> > + * fls - find last bit set
> > + * @x: the word to search
> > + *
> > + * This is defined the same way as ffs().
> >
>
> And this comment is even more wrong, given that ffs and fls are
> completely different functions ;)
>
> I know these are from the original, but its worth fixing given that
> you're touching it anyway.

I'll see if I can come up with something better.

Greetings,
Alexander

> > + */
> > +static inline int fls(int x)
> > +{
> > + int r;
> > +#ifdef CONFIG_X86_CMOV
> > + __asm__("bsrl %1,%0\n\t"
> > + "cmovzl %2,%0"
> > + : "=&r" (r) : "rm" (x), "rm" (-1));
> > +#else
> > + __asm__("bsrl %1,%0\n\t"
> > + "jnz 1f\n\t"
> > + "movl $-1,%0\n"
> > + "1:" : "=r" (r) : "rm" (x));
> > +#endif
> > + return r+1;
> > +}
> > +#endif /* __KERNEL__ */
> > +
> > #undef ADDR
> >
> > #ifdef CONFIG_X86_32
>
> J
--
Alexander van Heukelum
[email protected]

--
http://www.fastmail.fm - A no graphics, no pop-ups email service

2008-03-14 19:53:22

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] x86: merge the simple bitops and move them to bitops.h


The CMOV define should probably be dependent on what CPU the kernel
is tuned for. It was originally written for when x86-64 was only
K8 which has fast CMOV, but e.g. on P4 CMOV is actually deprecated
over jumps.

> Both define fls64(), but i386 uses a generic one and x86_64 defines
> one all by itself. The generic one is currently not suitable for
> use by 64-bit archs... that can change.

It is very unlikely a generic one will ever be able to compete
with a single instruction.

> x86_64 defines ARCH_HAS_FAST_MULTIPLIER, i386 not. This affects a
> choice of generated code in the (generic) hweight function. It would
> be nice if that could move to some other file.

It depends on the CPU, but it can be probably safely set on pretty much
all modern x86 cores.

> x86_64 has a mysterious inline function set_bit_string, which is
> only used by pci-calgary_64.c and pci-gart_64.c. Not sure what to
> do with it.

It's generic and could really live in linux/bitops.h

-Andi

Subject: [PATCH v2] x86: merge the simple bitops and move them to bitops.h

x86: merge the simple bitops and move them to bitops.h.

Some of those can be written in such a way that the same
inline assembly can be used to generate both 32 bit and
64 bit code.

For ffs and fls, x86_64 unconditionally used the cmov
instruction and i386 unconditionally used a conditional
branch over a mov instruction. In the current patch I
chose to select the version based on the availability
of the cmov instruction instead. A small detail here is
that x86_64 did not previously set CONFIG_X86_CMOV=y.

Improved comments for ffs and fls.

Signed-off-by: Alexander van Heukelum <[email protected]>

---

Hi,

Here is a new version with improved (I hope) comments for
the functions ffs and fls, suggested by Jeremy. I left
the explicit use of the -l forms in the inline assembly
of ffs and fls, because I think it is clearer that way.

Greetings,
Alexander

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 31e92fb..fb7399b 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -399,7 +399,7 @@ config X86_TSC
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
+ depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64)

config X86_MINIMUM_CPU_FAMILY
int
diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h
index 1a23ce1..363a4ff 100644
--- a/include/asm-x86/bitops.h
+++ b/include/asm-x86/bitops.h
@@ -66,7 +66,6 @@ static inline void __set_bit(int nr, volatile void *addr)
: "Ir" (nr) : "memory");
}

-
/**
* clear_bit - Clears a bit in memory
* @nr: Bit to clear
@@ -310,6 +309,104 @@ static int test_bit(int nr, const volatile unsigned long *addr);
constant_test_bit((nr),(addr)) : \
variable_test_bit((nr),(addr)))

+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ __asm__("bsf %1,%0"
+ :"=r" (word)
+ :"rm" (word));
+ return word;
+}
+
+/**
+ * ffz - find first zero in word.
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static inline unsigned long ffz(unsigned long word)
+{
+ __asm__("bsf %1,%0"
+ :"=r" (word)
+ :"r" (~word));
+ return word;
+}
+
+/*
+ * __fls: find last bit set.
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static inline unsigned long __fls(unsigned long word)
+{
+ __asm__("bsr %1,%0"
+ :"=r" (word)
+ :"rm" (word));
+ return word;
+}
+
+#ifdef __KERNEL__
+/**
+ * ffs - find first bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+static inline int ffs(int x)
+{
+ int r;
+#ifdef CONFIG_X86_CMOV
+ __asm__("bsfl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=r" (r) : "rm" (x), "r" (-1));
+#else
+ __asm__("bsfl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r + 1;
+}
+
+/**
+ * fls - find last bit set
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffs, but returns the position of the most significant set bit.
+ *
+ * fls(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 32.
+ */
+static inline int fls(int x)
+{
+ int r;
+#ifdef CONFIG_X86_CMOV
+ __asm__("bsrl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=&r" (r) : "rm" (x), "rm" (-1));
+#else
+ __asm__("bsrl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r + 1;
+}
+#endif /* __KERNEL__ */
+
#undef ADDR

#ifdef CONFIG_X86_32
diff --git a/include/asm-x86/bitops_32.h b/include/asm-x86/bitops_32.h
index 570f0fa..c19fbe9 100644
--- a/include/asm-x86/bitops_32.h
+++ b/include/asm-x86/bitops_32.h
@@ -38,20 +38,6 @@ static inline int find_first_zero_bit(const unsigned long *addr, unsigned size)
}

/**
- * __ffs - find first bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- __asm__("bsfl %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
-/**
* find_first_bit - find the first set bit in a memory region
* @addr: The address to start the search at
* @size: The maximum size to search
@@ -72,60 +58,10 @@ static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
return x;
}

-/**
- * ffz - find first zero in word.
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-static inline unsigned long ffz(unsigned long word)
-{
- __asm__("bsfl %1,%0"
- :"=r" (word)
- :"r" (~word));
- return word;
-}
-
#ifdef __KERNEL__

#include <asm-generic/bitops/sched.h>

-/**
- * ffs - find first bit set
- * @x: the word to search
- *
- * This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz() (man ffs).
- */
-static inline int ffs(int x)
-{
- int r;
-
- __asm__("bsfl %1,%0\n\t"
- "jnz 1f\n\t"
- "movl $-1,%0\n"
- "1:" : "=r" (r) : "rm" (x));
- return r+1;
-}
-
-/**
- * fls - find last bit set
- * @x: the word to search
- *
- * This is defined the same way as ffs().
- */
-static inline int fls(int x)
-{
- int r;
-
- __asm__("bsrl %1,%0\n\t"
- "jnz 1f\n\t"
- "movl $-1,%0\n"
- "1:" : "=r" (r) : "rm" (x));
- return r+1;
-}
-
#include <asm-generic/bitops/hweight.h>

#endif /* __KERNEL__ */
diff --git a/include/asm-x86/bitops_64.h b/include/asm-x86/bitops_64.h
index 87e1a17..866ed56 100644
--- a/include/asm-x86/bitops_64.h
+++ b/include/asm-x86/bitops_64.h
@@ -35,71 +35,11 @@ static inline void set_bit_string(unsigned long *bitmap, unsigned long i,
}
}

-/**
- * ffz - find first zero in word.
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-static inline unsigned long ffz(unsigned long word)
-{
- __asm__("bsfq %1,%0"
- :"=r" (word)
- :"r" (~word));
- return word;
-}
-
-/**
- * __ffs - find first bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- __asm__("bsfq %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
-/*
- * __fls: find last bit set.
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-static inline unsigned long __fls(unsigned long word)
-{
- __asm__("bsrq %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
#ifdef __KERNEL__

#include <asm-generic/bitops/sched.h>

/**
- * ffs - find first bit set
- * @x: the word to search
- *
- * This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz (man ffs).
- */
-static inline int ffs(int x)
-{
- int r;
-
- __asm__("bsfl %1,%0\n\t"
- "cmovzl %2,%0"
- : "=r" (r) : "rm" (x), "r" (-1));
- return r+1;
-}
-
-/**
* fls64 - find last bit set in 64 bit word
* @x: the word to search
*
@@ -112,22 +52,6 @@ static inline int fls64(__u64 x)
return __fls(x) + 1;
}

-/**
- * fls - find last bit set
- * @x: the word to search
- *
- * This is defined the same way as ffs.
- */
-static inline int fls(int x)
-{
- int r;
-
- __asm__("bsrl %1,%0\n\t"
- "cmovzl %2,%0"
- : "=&r" (r) : "rm" (x), "rm" (-1));
- return r+1;
-}
-
#define ARCH_HAS_FAST_MULTIPLIER 1

#include <asm-generic/bitops/hweight.h>

2008-03-14 21:17:19

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH] x86: merge the simple bitops and move them to bitops.h

Alexander van Heukelum wrote:
> On Fri, 14 Mar 2008 11:07:55 -0700, "Jeremy Fitzhardinge"
> <[email protected]> said:
>
>> Alexander van Heukelum wrote:
>>
>>> x86: merge the simple bitops and move them to bitops.h
>>>
>>> Some of those can be written in such a way that the same
>>> inline assembly can be used to generate both 32 bit and
>>> 64 bit code.
>>>
>>> For ffs and fls, x86_64 unconditionally used the cmov
>>> instruction and i386 unconditionally used a conditional
>>> branch over a mov instruction. In the current patch I
>>> chose to select the version based on the availability
>>> of the cmov instruction instead. A small detail here is
>>> that x86_64 did not previously set CONFIG_X86_CMOV=y.
>>>
>>>
>> Looks good in general. What's left in bitops_{32,64}.h now?
>>
>
> Thanks for taking a look!
>
> bitops_{32,64}.h are getting pretty empty ;)
>
> Both contain find_first_bit/find_first_zero_bit, i386 has them inlined,
> x86_64 has an ugly define to select between small bitmaps (inlined) and
> an out-of-line version. I think they should be unified much like how
> find_next_bit and find_next_zero_bit work now (in x86#testing).
>
> Both define fls64(), but i386 uses a generic one and x86_64 defines
> one all by itself. The generic one is currently not suitable for
> use by 64-bit archs... that can change.
>
> x86_64 defines ARCH_HAS_FAST_MULTIPLIER, i386 not. This affects a
> choice of generated code in the (generic) hweight function. It would
> be nice if that could move to some other file.
>
> x86_64 has a mysterious inline function set_bit_string, which is
> only used by pci-calgary_64.c and pci-gart_64.c. Not sure what to
> do with it.
>
>
>> (Some comments below.)
>>
>
> --- %< ---
>
>
>>> +#ifdef __KERNEL__
>>> +/**
>>> + * ffs - find first bit set
>>> + * @x: the word to search
>>> + *
>>> + * This is defined the same way as
>>> + * the libc and compiler builtin ffs routines, therefore
>>> + * differs in spirit from the above ffz() (man ffs).
>>>
>> This comment seems wrong. My "man ffs" says that it returns 1-32 for
>> non-zero inputs, and 0 for a zero input. This function returns 0-31, or
>> -1 for a zero input.
>>
>
> Seems, indeed. You missed the "return r + 1;" ;-)
>

Indeed I did.

J

2008-03-14 21:33:53

by Alexander van Heukelum

[permalink] [raw]
Subject: Re: [PATCH] x86: merge the simple bitops and move them to bitops.h

On Fri, 14 Mar 2008 20:55:20 +0100, "Andi Kleen" <[email protected]>
said:
>
> The CMOV define should probably be dependent on what CPU the kernel
> is tuned for. It was originally written for when x86-64 was only
> K8 which has fast CMOV, but e.g. on P4 CMOV is actually deprecated
> over jumps.

Hi Andi,

I guess you are right. But there is quite a big number of different
types of P4. Let's see what the current situation is... defconfigs
(of current x86#testing+this patch/current linus) with
CONFIG_GENERIC_CPU=n:

Athlon: 4764 / 4667 occurrences of cmovxx
Pentium-IV: 4079 / 3982 occurrences of cmovxx
Pentium-M: 3939 / 3841 occurrences of cmovxx
Core-2: 4335 / 4330 occurrences of cmovxx

So it adds a few percent extra cmovxx's. The last one is fishy...
But I'm too hungry and sleepy to go hunt that one down.

> > Both define fls64(), but i386 uses a generic one and x86_64 defines
> > one all by itself. The generic one is currently not suitable for
> > use by 64-bit archs... that can change.
>
> It is very unlikely a generic one will ever be able to compete
> with a single instruction.

Generic is maybe not the right term: asm-generic/bitops/fls64.h has:

static inline int fls64(__u64 x)
{
__u32 h = x >> 32;
if (h)
return fls(h) + 32;
return fls(x);
}

I just wanted to move the 64-bit version to that header, with some
ifdefs to select the right one.

> > x86_64 defines ARCH_HAS_FAST_MULTIPLIER, i386 not. This affects a
> > choice of generated code in the (generic) hweight function. It would
> > be nice if that could move to some other file.
>
> It depends on the CPU, but it can be probably safely set on pretty much
> all modern x86 cores.

In fact I just found out that it only had an effect for 64 bit
machines. Still, setting it unconditionally feels wrong.

> > x86_64 has a mysterious inline function set_bit_string, which is
> > only used by pci-calgary_64.c and pci-gart_64.c. Not sure what to
> > do with it.
>
> It's generic and could really live in linux/bitops.h

It could. But it is a trivial (slow?) implementation. Probably fine
for the uses in pci-calgary_64.c and pci-gart_64.c (small ranges?),
but I would worry about people using it, thinking it was a near-
optimal implementation.

Greetings,
Alexander
--
Alexander van Heukelum
[email protected]

--
http://www.fastmail.fm - IMAP accessible web-mail

2008-03-14 21:40:05

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] x86: merge the simple bitops and move them to bitops.h

On Fri, Mar 14, 2008 at 10:33:29PM +0100, Alexander van Heukelum wrote:
> static inline int fls64(__u64 x)
> {
> __u32 h = x >> 32;
> if (h)
> return fls(h) + 32;
> return fls(x);
> }
>
> I just wanted to move the 64-bit version to that header, with some
> ifdefs to select the right one.

That's still far more than the single 64bit instruction fls64 uses

> In fact I just found out that it only had an effect for 64 bit
> machines. Still, setting it unconditionally feels wrong.

I don't think your feeling is correct.

>
> > > x86_64 has a mysterious inline function set_bit_string, which is
> > > only used by pci-calgary_64.c and pci-gart_64.c. Not sure what to
> > > do with it.
> >
> > It's generic and could really live in linux/bitops.h
>
> It could. But it is a trivial (slow?) implementation. Probably fine

It is this way because the callers in 95+% of all cases only
set a single bit. For that case it is not slow.

-Andi

2008-03-14 22:01:32

by Alexander van Heukelum

[permalink] [raw]
Subject: Re: [PATCH] x86: merge the simple bitops and move them to bitops.h

On Fri, 14 Mar 2008 22:42:05 +0100, "Andi Kleen" <[email protected]>
said:
> On Fri, Mar 14, 2008 at 10:33:29PM +0100, Alexander van Heukelum wrote:
> > static inline int fls64(__u64 x)
> > {
> > __u32 h = x >> 32;
> > if (h)
> > return fls(h) + 32;
> > return fls(x);
> > }
> >
> > I just wanted to move the 64-bit version to that header, with some
> > ifdefs to select the right one.
>
> That's still far more than the single 64bit instruction fls64 uses

I agree that it should end up using bsr. It would look like this in
the end, I guess. Might be familiar.

#if BITS_PER_LONG == 32
static inline int fls64(__u64 x)
{
__u32 h = x >> 32;
if (h)
return __fls(h) + 33;
return fls(x);
}
#else
static inline int fls64(__u64 x)
{
if (x == 0)
return 0;
return __fls(x) + 1;
}
#endif

> > In fact I just found out that it only had an effect for 64 bit
> > machines. Still, setting it unconditionally feels wrong.
>
> I don't think your feeling is correct.

This is the only reason that this define exists. With another
name it would be fine. HWEIGHT_USE_MULTIPLIER?

> > > > x86_64 has a mysterious inline function set_bit_string, which is
> > > > only used by pci-calgary_64.c and pci-gart_64.c. Not sure what to
> > > > do with it.
> > >
> > > It's generic and could really live in linux/bitops.h
> >
> > It could. But it is a trivial (slow?) implementation. Probably fine
>
> It is this way because the callers in 95+% of all cases only
> set a single bit. For that case it is not slow.

And my feeling is that this is exactly the reason why this is
not a good version for a generic implementation in bitops.h. But
I don't care much.

Greetings,
Alexander

> -Andi
--
Alexander van Heukelum
[email protected]

--
http://www.fastmail.fm - Accessible with your email software
or over the web

2008-03-14 22:16:43

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] x86: merge the simple bitops and move them to bitops.h

> #else
> static inline int fls64(__u64 x)
> {
> if (x == 0)
> return 0;
> return __fls(x) + 1;

That would require a polymorphic macro __fls that adapts to 32bit and 64bit
arguments. Not good C style.

> This is the only reason that this define exists. With another
> name it would be fine. HWEIGHT_USE_MULTIPLIER?

AFAIK it only exists because some ancient sparc chips had incredibly
slow multipliers.

> And my feeling is that this is exactly the reason why this is
> not a good version for a generic implementation in bitops.h. But
> I don't care much.

I bet most different approaches that might be slightly
faster for larger bit strings would make the one-bit
case slower.

-Andi

2008-03-14 23:32:37

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH v2] x86: merge the simple bitops and move them to bitops.h

On Fri, 14 Mar 2008 21:35:26 +0100 Alexander van Heukelum wrote:

> x86: merge the simple bitops and move them to bitops.h.
>
>
> Improved comments for ffs and fls.

There is still room for some comment improvements IMO. See below.


> Signed-off-by: Alexander van Heukelum <[email protected]>
>
> ---
>
> diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h
> index 1a23ce1..363a4ff 100644
> --- a/include/asm-x86/bitops.h
> +++ b/include/asm-x86/bitops.h
> @@ -310,6 +309,104 @@ static int test_bit(int nr, const volatile unsigned long *addr);
> constant_test_bit((nr),(addr)) : \
> variable_test_bit((nr),(addr)))
>
> +/**
> + * __ffs - find first bit in word.

find first bit set in word

[The "first bit in word" could have any value in {0, 1}, so the
return value would depend on which end you considered "first". ;]

> + * @word: The word to search
> + *
> + * Undefined if no bit exists, so code should check against 0 first.
> + */
> +static inline unsigned long __ffs(unsigned long word)
> +{
> + __asm__("bsf %1,%0"
> + :"=r" (word)
> + :"rm" (word));
> + return word;
> +}
> +
> +/**
> + * ffz - find first zero in word.

find first zero bit in word

> + * @word: The word to search
> + *
> + * Undefined if no zero exists, so code should check against ~0UL first.
> + */
> +static inline unsigned long ffz(unsigned long word)
> +{
> + __asm__("bsf %1,%0"
> + :"=r" (word)
> + :"r" (~word));
> + return word;
> +}
> +
> +/*
> + * __fls: find last bit set.

__fls - find last bit set in word

> + * @word: The word to search
> + *
> + * Undefined if no bit exists, so code should check against 0 first.
> + */
> +static inline unsigned long __fls(unsigned long word)
> +{
> + __asm__("bsr %1,%0"
> + :"=r" (word)
> + :"rm" (word));
> + return word;
> +}
> +
> +#ifdef __KERNEL__
> +/**
> + * ffs - find first bit set

in word

> + * @x: the word to search
> + *
> + * This is defined the same way as the libc and compiler builtin ffs
> + * routines, therefore differs in spirit from the other bitops.
> + *
> + * ffs(value) returns 0 if value is 0 or the position of the first
> + * set bit if value is nonzero. The first (least significant) bit
> + * is at position 1.
> + */
> +static inline int ffs(int x)
> +{
> + int r;
> +#ifdef CONFIG_X86_CMOV
> + __asm__("bsfl %1,%0\n\t"
> + "cmovzl %2,%0"
> + : "=r" (r) : "rm" (x), "r" (-1));
> +#else
> + __asm__("bsfl %1,%0\n\t"
> + "jnz 1f\n\t"
> + "movl $-1,%0\n"
> + "1:" : "=r" (r) : "rm" (x));
> +#endif
> + return r + 1;
> +}
> +
> +/**
> + * fls - find last bit set

in word

> + * @x: the word to search
> + *
> + * This is defined in a similar way as the libc and compiler builtin
> + * ffs, but returns the position of the most significant set bit.
> + *
> + * fls(value) returns 0 if value is 0 or the position of the last
> + * set bit if value is nonzero. The last (most significant) bit is
> + * at position 32.
> + */
> +static inline int fls(int x)
> +{
> + int r;
> +#ifdef CONFIG_X86_CMOV
> + __asm__("bsrl %1,%0\n\t"
> + "cmovzl %2,%0"
> + : "=&r" (r) : "rm" (x), "rm" (-1));
> +#else
> + __asm__("bsrl %1,%0\n\t"
> + "jnz 1f\n\t"
> + "movl $-1,%0\n"
> + "1:" : "=r" (r) : "rm" (x));
> +#endif
> + return r + 1;
> +}
> +#endif /* __KERNEL__ */


---
~Randy

Subject: [PATCH v3] x86: merge the simple bitops and move them to bitops.h

x86: merge the simple bitops and move them to bitops.h.

Some of those can be written in such a way that the same
inline assembly can be used to generate both 32 bit and
64 bit code.

For ffs and fls, x86_64 unconditionally used the cmov
instruction and i386 unconditionally used a conditional
branch over a mov instruction. In the current patch I
chose to select the version based on the availability
of the cmov instruction instead. A small detail here is
that x86_64 did not previously set CONFIG_X86_CMOV=y.

Improved comments for ffs, ffz, fls and variations.

Signed-off-by: Alexander van Heukelum <[email protected]>

---

Hi Randy, Ingo,

Version 3, for x86#testing. Compared to v1 there are only
changes in the comments as suggested by Jeremy and Randy.

Thanks to both for their comments.

Greetings,
Alexander

arch/x86/Kconfig.cpu | 2 +-
include/asm-x86/bitops.h | 99 ++++++++++++++++++++++++++++++++++++++++++-
include/asm-x86/bitops_32.h | 64 ----------------------------
include/asm-x86/bitops_64.h | 76 ---------------------------------
4 files changed, 99 insertions(+), 142 deletions(-)

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 31e92fb..fb7399b 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -399,7 +399,7 @@ config X86_TSC
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
+ depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64)

config X86_MINIMUM_CPU_FAMILY
int
diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h
index 1a23ce1..923bdc2 100644
--- a/include/asm-x86/bitops.h
+++ b/include/asm-x86/bitops.h
@@ -66,7 +66,6 @@ static inline void __set_bit(int nr, volatile void *addr)
: "Ir" (nr) : "memory");
}

-
/**
* clear_bit - Clears a bit in memory
* @nr: Bit to clear
@@ -310,6 +309,104 @@ static int test_bit(int nr, const volatile unsigned long *addr);
constant_test_bit((nr),(addr)) : \
variable_test_bit((nr),(addr)))

+/**
+ * __ffs - find first set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ __asm__("bsf %1,%0"
+ :"=r" (word)
+ :"rm" (word));
+ return word;
+}
+
+/**
+ * ffz - find first zero bit in word
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static inline unsigned long ffz(unsigned long word)
+{
+ __asm__("bsf %1,%0"
+ :"=r" (word)
+ :"r" (~word));
+ return word;
+}
+
+/*
+ * __fls: find last set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __fls(unsigned long word)
+{
+ __asm__("bsr %1,%0"
+ :"=r" (word)
+ :"rm" (word));
+ return word;
+}
+
+#ifdef __KERNEL__
+/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+static inline int ffs(int x)
+{
+ int r;
+#ifdef CONFIG_X86_CMOV
+ __asm__("bsfl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=r" (r) : "rm" (x), "r" (-1));
+#else
+ __asm__("bsfl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r + 1;
+}
+
+/**
+ * fls - find last set bit in word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffs, but returns the position of the most significant set bit.
+ *
+ * fls(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 32.
+ */
+static inline int fls(int x)
+{
+ int r;
+#ifdef CONFIG_X86_CMOV
+ __asm__("bsrl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=&r" (r) : "rm" (x), "rm" (-1));
+#else
+ __asm__("bsrl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r + 1;
+}
+#endif /* __KERNEL__ */
+
#undef ADDR

#ifdef CONFIG_X86_32
diff --git a/include/asm-x86/bitops_32.h b/include/asm-x86/bitops_32.h
index 570f0fa..c19fbe9 100644
--- a/include/asm-x86/bitops_32.h
+++ b/include/asm-x86/bitops_32.h
@@ -38,20 +38,6 @@ static inline int find_first_zero_bit(const unsigned long *addr, unsigned size)
}

/**
- * __ffs - find first bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- __asm__("bsfl %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
-/**
* find_first_bit - find the first set bit in a memory region
* @addr: The address to start the search at
* @size: The maximum size to search
@@ -72,60 +58,10 @@ static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
return x;
}

-/**
- * ffz - find first zero in word.
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-static inline unsigned long ffz(unsigned long word)
-{
- __asm__("bsfl %1,%0"
- :"=r" (word)
- :"r" (~word));
- return word;
-}
-
#ifdef __KERNEL__

#include <asm-generic/bitops/sched.h>

-/**
- * ffs - find first bit set
- * @x: the word to search
- *
- * This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz() (man ffs).
- */
-static inline int ffs(int x)
-{
- int r;
-
- __asm__("bsfl %1,%0\n\t"
- "jnz 1f\n\t"
- "movl $-1,%0\n"
- "1:" : "=r" (r) : "rm" (x));
- return r+1;
-}
-
-/**
- * fls - find last bit set
- * @x: the word to search
- *
- * This is defined the same way as ffs().
- */
-static inline int fls(int x)
-{
- int r;
-
- __asm__("bsrl %1,%0\n\t"
- "jnz 1f\n\t"
- "movl $-1,%0\n"
- "1:" : "=r" (r) : "rm" (x));
- return r+1;
-}
-
#include <asm-generic/bitops/hweight.h>

#endif /* __KERNEL__ */
diff --git a/include/asm-x86/bitops_64.h b/include/asm-x86/bitops_64.h
index 87e1a17..866ed56 100644
--- a/include/asm-x86/bitops_64.h
+++ b/include/asm-x86/bitops_64.h
@@ -35,71 +35,11 @@ static inline void set_bit_string(unsigned long *bitmap, unsigned long i,
}
}

-/**
- * ffz - find first zero in word.
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-static inline unsigned long ffz(unsigned long word)
-{
- __asm__("bsfq %1,%0"
- :"=r" (word)
- :"r" (~word));
- return word;
-}
-
-/**
- * __ffs - find first bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- __asm__("bsfq %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
-/*
- * __fls: find last bit set.
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-static inline unsigned long __fls(unsigned long word)
-{
- __asm__("bsrq %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
#ifdef __KERNEL__

#include <asm-generic/bitops/sched.h>

/**
- * ffs - find first bit set
- * @x: the word to search
- *
- * This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz (man ffs).
- */
-static inline int ffs(int x)
-{
- int r;
-
- __asm__("bsfl %1,%0\n\t"
- "cmovzl %2,%0"
- : "=r" (r) : "rm" (x), "r" (-1));
- return r+1;
-}
-
-/**
* fls64 - find last bit set in 64 bit word
* @x: the word to search
*
@@ -112,22 +52,6 @@ static inline int fls64(__u64 x)
return __fls(x) + 1;
}

-/**
- * fls - find last bit set
- * @x: the word to search
- *
- * This is defined the same way as ffs.
- */
-static inline int fls(int x)
-{
- int r;
-
- __asm__("bsrl %1,%0\n\t"
- "cmovzl %2,%0"
- : "=&r" (r) : "rm" (x), "rm" (-1));
- return r+1;
-}
-
#define ARCH_HAS_FAST_MULTIPLIER 1

#include <asm-generic/bitops/hweight.h>

2008-03-15 17:54:22

by Alexander van Heukelum

[permalink] [raw]
Subject: Re: [PATCH] x86: merge the simple bitops and move them to bitops.h


On Fri, 14 Mar 2008 23:18:45 +0100, "Andi Kleen" <[email protected]>
said:
> > #else
> > static inline int fls64(__u64 x)
> > {
> > if (x == 0)
> > return 0;
> > return __fls(x) + 1;
>
> That would require a polymorphic macro __fls that adapts to 32bit and
> 64bit arguments. Not good C style.

Hi Andi,

It's unsigned long __fls(unsigned long)... and this is only compiled
if unsigned long is as long as u64. Seems fine to me. Moreover, it
is _exactly_ how it is done in x86_64 now. I must be missing something.

> > This is the only reason that this define exists. With another
> > name it would be fine. HWEIGHT_USE_MULTIPLIER?
>
> AFAIK it only exists because some ancient sparc chips had incredibly
> slow multipliers.

Good to know. And I realized that there is also the machines without
a hardware multiply instruction at all. So you are right. i386/x86_64
should just unconditionally set ARCH_HAS_FAST_MULTIPLIER.

> > And my feeling is that this is exactly the reason why this is
> > not a good version for a generic implementation in bitops.h. But
> > I don't care much.
>
> I bet most different approaches who might be slightly
> faster for larger bit strings would make the one bit
> case slower.

That is true, of course. But then the name of the function should
give a hint that it is optimized for short sequences.

Greetings,
Alexander
--
Alexander van Heukelum
[email protected]

--
http://www.fastmail.fm - Or how I learned to stop worrying and
love email again

Subject: K8, EFFICEON and CORE2 support the cmovxx instructions.

K8, EFFICEON and CORE2 support the cmovxx instructions.

Instead of listing the cpu's that have support for the
cmovxx instructions, list the cpu's that don't.

Signed-off-by: Alexander van Heukelum <[email protected]>
---

A bit of playing resulted in:

CPUS="M386 M486 M586 M586TSC M586MMX M686 MPENTIUMII MPENTIUMIII"
CPUS=$CPUS" MPENTIUMM MPENTIUM4 MK6 MK7 MK8 MCRUSOE MEFFICEON"
CPUS=$CPUS" MWINCHIPC6 MWINCHIP2 MWINCHIP3D MGEODEGX1 MGEODE_LX"
CPUS=$CPUS" MCYRIXIII MVIAC3_2 MVIAC7 MPSC MCORE2"

for cpu in $CPUS
do
echo "CONFIG_${cpu}=y" > testconfig
make ARCH=i386 allnoconfig KCONFIG_ALLCONFIG=testconfig > /dev/null
echo ${cpu} >> result
grep X86_CMOV .config >> result
echo >> result
done

I'm quite sure that K8, EFFICEON and CORE2 support HAVE_CMOV, but
they did not set X86_CMOV.

Greetings,
Alexander van Heukelum

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 31e92fb..7a3a2d4 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -399,7 +399,7 @@ config X86_TSC
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
+ depends on !(MCYRIXIII || MGEODE_LX || MGEODEGX1 || MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MK6 || M586MMX || M586TSC || M586 || M486 || M386)

config X86_MINIMUM_CPU_FAMILY
int

2008-03-15 20:22:17

by H. Peter Anvin

[permalink] [raw]
Subject: Re: K8, EFFICEON and CORE2 support the cmovxx instructions.

Alexander van Heukelum wrote:
> K8, EFFICEON and CORE2 support the cmovxx instructions.
>
> Instead of listing the cpu's that have support for the
> cmovxx instructions, list the cpu's that don't.
>
> Signed-off-by: Alexander van Heukelum <[email protected]>
> ---
>
> A bit of playing resulted in:
>
> CPUS="M386 M486 M586 M586TSC M586MMX M686 MPENTIUMII MPENTIUMIII"
> CPUS=$CPUS" MPENTIUMM MPENTIUM4 MK6 MK7 MK8 MCRUSOE MEFFICEON"
> CPUS=$CPUS" MWINCHIPC6 MWINCHIP2 MWINCHIP3D MGEODEGX1 MGEODE_LX"
> CPUS=$CPUS" MCYRIXIII MVIAC3_2 MVIAC7 MPSC MCORE2"
>
> for cpu in $CPUS
> do
> echo "CONFIG_${cpu}=y" > testconfig
> make ARCH=i386 allnoconfig KCONFIG_ALLCONFIG=testconfig > /dev/null
> echo ${cpu} >> result
> grep X86_CMOV .config >> result
> echo >> result
> done
>
> I'm quite sure that K8, EFFICEON and CORE2 support HAVE_CMOV, but
> they did not set X86_CMOV.
>

Crusoe has CMOV as well.

-hpa

2008-03-15 21:06:57

by Alexander van Heukelum

[permalink] [raw]
Subject: Re: K8, EFFICEON and CORE2 support the cmovxx instructions.

On Sat, 15 Mar 2008 21:18:16 +0100, "H. Peter Anvin" <[email protected]>
said:
> Alexander van Heukelum wrote:
> > K8, EFFICEON and CORE2 support the cmovxx instructions.
> >
> > Instead of listing the cpu's that have support for the
> > cmovxx instructions, list the cpu's that don't.
> >
> > Signed-off-by: Alexander van Heukelum <[email protected]>
> > ---
> >
> > A bit of playing resulted in:
> >
> > CPUS="M386 M486 M586 M586TSC M586MMX M686 MPENTIUMII MPENTIUMIII"
> > CPUS=$CPUS" MPENTIUMM MPENTIUM4 MK6 MK7 MK8 MCRUSOE MEFFICEON"
> > CPUS=$CPUS" MWINCHIPC6 MWINCHIP2 MWINCHIP3D MGEODEGX1 MGEODE_LX"
> > CPUS=$CPUS" MCYRIXIII MVIAC3_2 MVIAC7 MPSC MCORE2"
> >
> > for cpu in $CPUS
> > do
> > echo "CONFIG_${cpu}=y" > testconfig
> > make ARCH=i386 allnoconfig KCONFIG_ALLCONFIG=testconfig > /dev/null
> > echo ${cpu} >> result
> > grep X86_CMOV .config >> result
> > echo >> result
> > done
> >
> > I'm quite sure that K8, EFFICEON and CORE2 support HAVE_CMOV, but
> > they did not set X86_CMOV.
> >
>
> Crusoe has CMOV as well.

Hi hpa,

I believe you... and the Makefile_32.cpu: it gets a -march=i686. I
also found out that I missed X86_ELAN, which is treated as a i486.

Also, MGEODE_LX does not get any optimization flags. I think that
is unintentional, but what should they be?

And MPSC (Intel P4 / older Netburst based Xeon) depends on X86_64.
I would be very surprised if that one could not run 32-bit code,
though. What flags should that one get?

Thanks for pointing out I missed Crusoe. As a punishment you
can answer my questions ;).

Greetings,
Alexander
--
Alexander van Heukelum
[email protected]

--
http://www.fastmail.fm - Send your email first class

2008-03-15 21:12:40

by Willy Tarreau

[permalink] [raw]
Subject: Re: K8, EFFICEON and CORE2 support the cmovxx instructions.

On Sat, Mar 15, 2008 at 10:06:32PM +0100, Alexander van Heukelum wrote:
> > > I'm quite sure that K8, EFFICEON and CORE2 support HAVE_CMOV, but
> > > they did not set X86_CMOV.
> > >
> >
> > Crusoe has CMOV as well.

So does GEODE_LX BTW

> I believe you... and the Makefile_32.cpu: it gets a -march=i686. I
> also found out that I missed X86_ELAN, which is treated as a i486.
>
> Also, MGEODE_LX does not get any optimization flags. I think that
> is unintentional, but what should they be?

in my experience (on user-space code), optimizing for "i586" gives
good results. However, the cache is small, so everything which can
reduce code size (especially loop/jump/function alignment) is worth
checking.

Regards,
Willy

Subject: [PATCH] x86: K8, GEODE_LX, CRUSOE, EFFICEON and CORE2 support the cmovxx instructions.

x86: K8, GEODE_LX, CRUSOE, EFFICEON and CORE2 support CMOV.

Instead of summing up the cpu's that have support for the
cmov instruction, sum up the cpu's that don't.

GEODE_LX has no entry in arch/x86/Makefile_32.cpu, so it
did not get any -march or -mtune arguments at all. I quite
arbitrarily added -march=i686 -mtune=pentium-mmx. This way
gcc is allowed to use cmov instructions.

Signed-off-by: Alexander van Heukelum <[email protected]>
---

Willy Tarreau said:
> > > Crusoe has CMOV as well.
>
> So does GEODE_LX BTW

> in my experience (on user-space code), optimizing for "i586" gives
> good results. However, the cache is small, so everything which can
> reduce code size (especially loop/jump/function alignment) is worth
> checking.

Thanks for letting me know.

I chose i686 with pentium-mmx scheduling. This enables cmov, while
the scheduling is still pentium-like. If i586 is really better,
GEODE_LX should be listed as not using cmov at all (because the
compiler does not generate them).

Loop/jump/function alignment is automatically turned off if you
compile the kernel with CONFIG_CC_OPTIMIZE_FOR_SIZE=y so I think
it should not be set explicitly.

Qemu does not run GEODE_LX-kernels (no 3dNow).

Greetings,
Alexander

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 31e92fb..38af3e1 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -399,7 +399,7 @@ config X86_TSC
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
+ depends on !(MCYRIXIII || MGEODEGX1 || MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MK6 || M586MMX || M586TSC || M586 || M486 || M386 || X86_ELAN)

config X86_MINIMUM_CPU_FAMILY
int
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index e372b58..b5e2438 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -38,8 +38,9 @@ cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
# AMD Elan support
cflags-$(CONFIG_X86_ELAN) += -march=i486

-# Geode GX1 support
+# Geode GX1 and GX/LX support
cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
+cflags-$(CONFIG_MGEODE_LX) += -march=i686 $(call tune,pentium-mmx)

# add at the end to overwrite eventual tuning options from earlier
# cpu entries

2008-03-21 12:36:20

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v3] x86: merge the simple bitops and move them to bitops.h


* Alexander van Heukelum <[email protected]> wrote:

> x86: merge the simple bitops and move them to bitops.h.
>
> Some of those can be written in such a way that the same inline
> assembly can be used to generate both 32 bit and 64 bit code.
>
> For ffs and fls, x86_64 unconditionally used the cmov instruction and
> i386 unconditionally used a conditional branch over a mov instruction.
> In the current patch I chose to select the version based on the
> availability of the cmov instruction instead. A small detail here is
> that x86_64 did not previously set CONFIG_X86_CMOV=y.
>
> Improved comments for ffs, ffz, fls and variations.

thanks Alexander, applied.

Ingo

2008-03-21 12:38:34

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: K8, GEODE_LX, CRUSOE, EFFICEON and CORE2 support the cmovxx instructions.

* Alexander van Heukelum <[email protected]> wrote:

> x86: K8, GEODE_LX, CRUSOE, EFFICEON and CORE2 support CMOV.
>
> Instead of summing up the cpu's that have support for the
> cmov instruction, sum up the cpu's that don't.
>
> GEODE_LX has no entry in arch/x86/Makefile_32.cpu, so it did not get
> any -march or -mtune arguments at all. I quite arbitrarily added
> -march=i686 -mtune=pentium-mmx. This way gcc is allowed to use cmov
> instructions.

thanks, applied.

Ingo