From: Rasmus Villemoes
To: Yury Norov
Cc: linux@horizon.com, klimov.linux@gmail.com, akpm@linux-foundation.org,
    davem@davemloft.net, dborkman@redhat.com, hannes@stressinduktion.org,
    laijs@cn.fujitsu.com, msalter@redhat.com, takahiro.akashi@linaro.org,
    tgraf@suug.ch, valentinrothberg@gmail.com, linux-kernel@vger.kernel.org,
    chris@chris-wilson.co.uk
Subject: Re: [PATCH v4 1/3] lib: find_*_bit reimplementation
Organization: D03
Date: Wed, 18 Feb 2015 18:57:36 +0100
In-Reply-To: <1424140543-865-2-git-send-email-yury.norov@gmail.com> (Yury Norov's message of "Tue, 17 Feb 2015 05:35:41 +0300")
Message-ID: <87h9uj14rz.fsf@rasmusvillemoes.dk>
References: <1423404619-10653-1-git-send-email-yury.norov@gmail.com> <1424140543-865-1-git-send-email-yury.norov@gmail.com> <1424140543-865-2-git-send-email-yury.norov@gmail.com>
User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/24.3 (gnu/linux)
X-Mailing-List: linux-kernel@vger.kernel.org

On Tue, Feb 17 2015, Yury Norov wrote:

> The new implementation takes less space in the sources
> (see diffstat) and in the object. For me it's 710 vs 453
> bytes of text. It also shows a better performance.
>
> find_last_bit description fixed due to obvious typo.
>
> In this patch 2 macros were introduced: {LOW,HIGH}_BITS_MASK,
> that are doing almost the same as BITMAP_{FIRST,LAST}_WORD_MASK
> in include/linux/bitmap.h. But 'LAST' macro is potentially less
> effective, because it issues a conditional branch that can be
> omitted. If it is replaced one day by a more effective
> implementation, {LOW,HIGH}_BITS_MASK can be removed.
>

I think it's better to use the existing macros and then improve them
instead of duplicating the functionality. I'll submit a patch for that
later tonight (that should then make it to 3.21 [or whatever 3.19+2
will be called] together with this series). More on this issue below.
>
> Signed-off-by: Yury Norov
> ---
>  include/linux/bitops.h |   4 +-
>  lib/find_last_bit.c    |  37 +++----
>  lib/find_next_bit.c    | 269 ++++++++++++++----------------------------
>  3 files changed, 94 insertions(+), 216 deletions(-)
>
> diff --git a/include/linux/bitops.h b/include/linux/bitops.h
> index 5d858e0..297f5bd 100644
> --- a/include/linux/bitops.h
> +++ b/include/linux/bitops.h
> @@ -218,9 +218,9 @@ static inline unsigned long __ffs64(u64 word)
>  /**
>   * find_last_bit - find the last set bit in a memory region
>   * @addr: The address to start the search at
> - * @size: The maximum size to search
> + * @size: The number of bits to search
>   *
> - * Returns the bit number of the first set bit, or size.
> + * Returns the bit number of the last set bit, or size.
>   */
>  extern unsigned long find_last_bit(const unsigned long *addr,
>  				   unsigned long size);
> diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
> index 91ca09f..edbb281 100644
> --- a/lib/find_last_bit.c
> +++ b/lib/find_last_bit.c
> @@ -4,6 +4,9 @@
>   * Written by Rusty Russell
>   * (Inspired by David Howell's find_next_bit implementation)
>   *
> + * Rewritten by Yury Norov to decrease
> + * size and improve performance, 2015.
> + *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
>   * as published by the Free Software Foundation; either version
> @@ -12,36 +15,26 @@
>
>  #include
>  #include
> -#include
> -#include
> +#include
> +
> +#define LOW_BITS_MASK(nr)	(~0UL >> -(nr))

This is technically wrong, and may not even work on architectures that
are not as forgiving as x86: Whatever type and value nr has, -(nr) is
almost guaranteed not to be a number between 0 and BITS_PER_LONG-1.
And even on x86, gcc doesn't generate as good code as it could:

 163:	49 c7 c0 ff ff ff ff	mov    $0xffffffffffffffff,%r8
 16a:	83 e1 3f            	and    $0x3f,%ecx
 16d:	f7 d9               	neg    %ecx
 16f:	48 c1 ea 06         	shr    $0x6,%rdx
 173:	49 d3 e8            	shr    %cl,%r8

It doesn't realize that pre-masking %ecx with 0x3f is redundant when we
then negate it and use it as a shift amount. So a better definition of
the macro is

#define BITMAP_LAST_WORD_MASK(nr) (~0UL >> (-(nr) & (BITS_PER_LONG-1)))

and then callers shouldn't do the modulo. On x86, gcc knows that the &
is redundant. I use & instead of % so that nr may also have signed type
(otherwise we're again in UB land, since -(nr) % BITS_PER_LONG is then,
by the broken C standard, a negative number).

>  #include
>  #include
> -#include
> -#include
> +#include
>
> -#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
> +#define HIGH_BITS_MASK(nr)	(~0UL << (nr))
> +
> +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
>
> -#ifndef find_next_bit
>  /*
> - * Find the next set bit in a memory region.
> + * This is a common helper function for find_next_bit and
> + * find_next_zero_bit. The difference is the "invert" argument, which
> + * is XORed with each fetched word before searching it for one bits.
>   */
> -unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
> -			    unsigned long offset)
> +static unsigned long _find_next_bit(const unsigned long *addr,
> +		unsigned long nbits, unsigned long start, unsigned long invert)
>  {
> -	const unsigned long *p = addr + BITOP_WORD(offset);
> -	unsigned long result = offset & ~(BITS_PER_LONG-1);
>  	unsigned long tmp;
>
> -	if (offset >= size)
> -		return size;
> -	size -= result;
> -	offset %= BITS_PER_LONG;
> -	if (offset) {
> -		tmp = *(p++);
> -		tmp &= (~0UL << offset);
> -		if (size < BITS_PER_LONG)
> -			goto found_first;
> -		if (tmp)
> -			goto found_middle;
> -		size -= BITS_PER_LONG;
> -		result += BITS_PER_LONG;
> -	}
> -	while (size & ~(BITS_PER_LONG-1)) {
> -		if ((tmp = *(p++)))
> -			goto found_middle;
> -		result += BITS_PER_LONG;
> -		size -= BITS_PER_LONG;
> +	if (!nbits || start >= nbits)
> +		return nbits;
> +
> +	tmp = addr[start / BITS_PER_LONG] ^ invert;
> +
> +	/* Handle 1st word. */
> +	tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);

And of course here, I'd then suggest using BITMAP_FIRST_WORD_MASK(start)
(that even matches the comment :-)), omitting the definition of
HIGH_BITS_MASK.

> @@ -113,24 +78,14 @@ EXPORT_SYMBOL(find_next_zero_bit);
>   */
>  unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
>  {
> -	const unsigned long *p = addr;
> -	unsigned long result = 0;
> -	unsigned long tmp;
> +	unsigned long idx;
>
> -	while (size & ~(BITS_PER_LONG-1)) {
> -		if ((tmp = *(p++)))
> -			goto found;
> -		result += BITS_PER_LONG;
> -		size -= BITS_PER_LONG;
> +	for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
> +		if (addr[idx])
> +			return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
>  	}
> -	if (!size)
> -		return result;
>
> -	tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
> -	if (tmp == 0UL)		/* Are any bits set? */
> -		return result + size;	/* Nope. */
> -found:
> -	return result + __ffs(tmp);
> +	return size;
>  }
>  EXPORT_SYMBOL(find_first_bit);
>  #endif
> @@ -141,24 +96,14 @@ EXPORT_SYMBOL(find_first_bit);
>   */
>  unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
>  {
> -	const unsigned long *p = addr;
> -	unsigned long result = 0;
> -	unsigned long tmp;
> +	unsigned long idx;
>
> -	while (size & ~(BITS_PER_LONG-1)) {
> -		if (~(tmp = *(p++)))
> -			goto found;
> -		result += BITS_PER_LONG;
> -		size -= BITS_PER_LONG;
> +	for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
> +		if (addr[idx] != ~0UL)
> +			return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
>  	}

Since I'm afraid the above means I have to ask you to send a v5, I
might as well also comment on this: I think it would make the code much
more obviously parallel to find_first_bit if the test was
"if (~addr[idx])" and the ffz is then replaced by __ffs(~addr[idx]).
Many architectures implement ffz(x) as __ffs(~x) anyway, so it
shouldn't be any less efficient. But it's no big deal, so if you feel
this is better, just leave it.

Rasmus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/