2017-12-11 05:38:08

by Al Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] apparent big-endian bugs in dwc-xlgmac

On Mon, Dec 11, 2017 at 05:05:20AM +0000, Al Viro wrote:

> What for? Sure, this variant will work, but why bother with
> a = le32_to_cpu(b);
> (cpu_to_le32(a) & ....) | ....
> and how is that better than
> (b & ...) | ...
>
> IDGI... Mind you, I'm not sure if there is any point keeping _var in that thing,
> seeing that we use var only once - might be better off with
> ((var) & ~cpu_to_le32(GENMASK(_pos + _len - 1, _pos))) | \
> cpu_to_le32(_val); \

FWIW, seeing how many drivers end up open-coding that, I'm rather tempted to
add to linux/bitops.h or linux/byteorder/generic.h the following:

static inline __le16 le16_replace_bits(__le16 old, u16 v, int bit, int size)
{
__le16 mask = cpu_to_le16(GENMASK(bit + size - 1, bit));
return (old & ~mask) | (cpu_to_le16(v << bit) & mask);
}

static inline __le32 le32_replace_bits(__le32 old, u32 v, int bit, int size)
{
__le32 mask = cpu_to_le32(GENMASK(bit + size - 1, bit));
return (old & ~mask) | (cpu_to_le32(v << bit) & mask);
}

static inline __le64 le64_replace_bits(__le64 old, u64 v, int bit, int size)
{
__le64 mask = cpu_to_le64(GENMASK_ULL(bit + size - 1, bit));
return (old & ~mask) | (cpu_to_le64(v << bit) & mask);
}

static inline __be16 be16_replace_bits(__be16 old, u16 v, int bit, int size)
{
__be16 mask = cpu_to_be16(GENMASK(bit + size - 1, bit));
return (old & ~mask) | (cpu_to_be16(v << bit) & mask);
}

static inline __be32 be32_replace_bits(__be32 old, u32 v, int bit, int size)
{
__be32 mask = cpu_to_be32(GENMASK(bit + size - 1, bit));
return (old & ~mask) | (cpu_to_be32(v << bit) & mask);
}

static inline __be64 be64_replace_bits(__be64 old, u64 v, int bit, int size)
{
__be64 mask = cpu_to_be64(GENMASK_ULL(bit + size - 1, bit));
return (old & ~mask) | (cpu_to_be64(v << bit) & mask);
}

static inline u16 le16_get_bits(__le16 v, int bit, int size)
{
return (le16_to_cpu(v) >> bit) & (BIT(size) - 1);
}

static inline u32 le32_get_bits(__le32 v, int bit, int size)
{
return (le32_to_cpu(v) >> bit) & (BIT(size) - 1);
}

static inline u64 le64_get_bits(__le64 v, int bit, int size)
{
return (le64_to_cpu(v) >> bit) & (BIT_ULL(size) - 1);
}

static inline u16 be16_get_bits(__be16 v, int bit, int size)
{
return (be16_to_cpu(v) >> bit) & (BIT(size) - 1);
}

static inline u32 be32_get_bits(__be32 v, int bit, int size)
{
return (be32_to_cpu(v) >> bit) & (BIT(size) - 1);
}

static inline u64 be64_get_bits(__be64 v, int bit, int size)
{
return (be64_to_cpu(v) >> bit) & (BIT_ULL(size) - 1);
}

and let drivers use those...


2017-12-11 06:46:56

by Jie Deng

[permalink] [raw]
Subject: Re: [RFC][PATCH] apparent big-endian bugs in dwc-xlgmac

On 2017/12/11 13:38, Al Viro wrote:

> On Mon, Dec 11, 2017 at 05:05:20AM +0000, Al Viro wrote:
>
>> What for? Sure, this variant will work, but why bother with
>> a = le32_to_cpu(b);
>> (cpu_to_le32(a) & ....) | ....
>> and how is that better than
>> (b & ...) | ...
>>
>> IDGI... Mind you, I'm not sure if there is any point keeping _var in that thing,
>> seeing that we use var only once - might be better off with
>> ((var) & ~cpu_to_le32(GENMASK(_pos + _len - 1, _pos))) | \
>> cpu_to_le32(_val); \
> FWIW, seeing how many drivers end up open-coding that, I'm rather tempted to
> add to linux/bitops.h or linux/byteorder/generic.h the following:
>
> static inline __le16 le16_replace_bits(__le16 old, u16 v, int bit, int size)
> {
> __le16 mask = cpu_to_le16(GENMASK(bit + size - 1, bit));
> return (old & ~mask) | (cpu_to_le16(v << bit) & mask);
> }
>
> static inline __le32 le32_replace_bits(__le32 old, u32 v, int bit, int size)
> {
> __le32 mask = cpu_to_le32(GENMASK(bit + size - 1, bit));
> return (old & ~mask) | (cpu_to_le32(v << bit) & mask);
> }
>
> static inline __le64 le64_replace_bits(__le64 old, u64 v, int bit, int size)
> {
> __le64 mask = cpu_to_le64(GENMASK_ULL(bit + size - 1, bit));
> return (old & ~mask) | (cpu_to_le64(v << bit) & mask);
> }
>
> static inline __be16 be16_replace_bits(__be16 old, u16 v, int bit, int size)
> {
> __be16 mask = cpu_to_be16(GENMASK(bit + size - 1, bit));
> return (old & ~mask) | (cpu_to_be16(v << bit) & mask);
> }
>
> static inline __be32 be32_replace_bits(__be32 old, u32 v, int bit, int size)
> {
> __be32 mask = cpu_to_be32(GENMASK(bit + size - 1, bit));
> return (old & ~mask) | (cpu_to_be32(v << bit) & mask);
> }
>
> static inline __be64 be64_replace_bits(__be64 old, u64 v, int bit, int size)
> {
> __be64 mask = cpu_to_be64(GENMASK_ULL(bit + size - 1, bit));
> return (old & ~mask) | (cpu_to_be64(v << bit) & mask);
> }
>
> static inline u16 le16_get_bits(__le16 v, int bit, int size)
> {
> return (le16_to_cpu(v) >> bit) & (BIT(size) - 1);
> }
>
> static inline u32 le32_get_bits(__le32 v, int bit, int size)
> {
> return (le32_to_cpu(v) >> bit) & (BIT(size) - 1);
> }
>
> static inline u64 le64_get_bits(__le64 v, int bit, int size)
> {
> return (le64_to_cpu(v) >> bit) & (BIT_ULL(size) - 1);
> }
>
> static inline u16 be16_get_bits(__be16 v, int bit, int size)
> {
> return (be16_to_cpu(v) >> bit) & (BIT(size) - 1);
> }
>
> static inline u32 be32_get_bits(__be32 v, int bit, int size)
> {
> return (be32_to_cpu(v) >> bit) & (BIT(size) - 1);
> }
>
> static inline u64 be64_get_bits(__be64 v, int bit, int size)
> {
> return (be64_to_cpu(v) >> bit) & (BIT_ULL(size) - 1);
> }
>
> and let drivers use those...
Sounds good. As a driver developer, I'm happy to see this.

2017-12-11 15:54:25

by Al Viro

[permalink] [raw]
Subject: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

A lot of drivers are open-coding the "replace these bits in __be32 with
the following value" kind of primitives. Let's add them to byteorder.h.

Primitives:
{be,le}{16,32,64}_replace_bits(old, v, bit, nbits)
{be,le}{16,32,64}_get_bits(val, bit, nbits)

Essentially, it gives helpers for work with bitfields in fixed-endian.
Suppose we have e.g. a little-endian 32bit value with fixed layout;
expressing that as a bitfield would go like
struct foo {
unsigned foo:4; /* bits 0..3 */
unsigned :2;
unsigned bar:12; /* bits 6..17 */
unsigned baz:14; /* bits 18..31 */
}
Even for host-endian it doesn't work all that well - you end up with
ifdefs in structure definition and generated code stinks. For fixed-endian
it gets really painful, and people tend to use explicit shift-and-mask
kind of macros for accessing the fields (and often enough get the
endianness conversions wrong, at that). With these primitives

struct foo v <=> __le32 v
v.foo = i ? 1 : 2 <=> v = le32_replace_bits(v, i ? 1 : 2, 0, 4)
f(4 + v.baz) <=> f(4 + le32_get_bits(v, 18, 14))

Signed-off-by: Al Viro <[email protected]>
---
diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index 451aaa0786ae..d8f169a7104a 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -187,4 +187,26 @@ static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len)
dst[i] = be32_to_cpu(src[i]);
}

+#define ____MASK(bit, nbits) ((((1ULL << ((nbits) - 1)) << 1) - 1) << (bit))
+#define ____MAKE_OP(type,base) \
+static inline __##type type##_replace_bits(__##type old, \
+ base val, int bit, int nbits) \
+{ \
+ __##type mask = cpu_to_##type(____MASK(bit, nbits)); \
+ return (old & ~mask) | (cpu_to_##type(val << bit) & mask); \
+} \
+static inline base type##_get_bits(__##type val, int bit, int nbits) \
+{ \
+ return (type##_to_cpu(val) >> bit) & ____MASK(0, nbits); \
+}
+
+____MAKE_OP(le16,u16)
+____MAKE_OP(le32,u32)
+____MAKE_OP(le64,u64)
+____MAKE_OP(be16,u16)
+____MAKE_OP(be32,u32)
+____MAKE_OP(be64,u64)
+#undef ____MAKE_OP
+#undef ____MASK
+
#endif /* _LINUX_BYTEORDER_GENERIC_H */

2017-12-12 04:02:34

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Mon, 11 Dec 2017 15:54:22 +0000, Al Viro wrote:
> Essentially, it gives helpers for work with bitfields in fixed-endian.
> Suppose we have e.g. a little-endian 32bit value with fixed layout;
> expressing that as a bitfield would go like
> struct foo {
> unsigned foo:4; /* bits 0..3 */
> unsigned :2;
> unsigned bar:12; /* bits 6..17 */
> unsigned baz:14; /* bits 18..31 */
> }
> Even for host-endian it doesn't work all that well - you end up with
> ifdefs in structure definition and generated code stinks. For fixed-endian
> it gets really painful, and people tend to use explicit shift-and-mask
> kind of macros for accessing the fields (and often enough get the
> endianness conversions wrong, at that). With these primitives
>
> struct foo v <=> __le32 v
> v.foo = i ? 1 : 2 <=> v = le32_replace_bits(v, i ? 1 : 2, 0, 4)
> f(4 + v.baz) <=> f(4 + le32_get_bits(v, 18, 14))

Looks very useful. The [start bit, size] pair may not lend itself
too nicely to creating defines, though. Which is why in
include/linux/bitfield.h we tried to use a shifted mask and work
backwards from that single value what the start and size are. commit
3e9b3112ec74 ("add basic register-field manipulation macros") has the
description. Could a similar trick perhaps be applicable here?

2017-12-12 06:20:08

by Al Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Mon, Dec 11, 2017 at 08:02:24PM -0800, Jakub Kicinski wrote:
> On Mon, 11 Dec 2017 15:54:22 +0000, Al Viro wrote:
> > Essentially, it gives helpers for work with bitfields in fixed-endian.
> > Suppose we have e.g. a little-endian 32bit value with fixed layout;
> > expressing that as a bitfield would go like
> > struct foo {
> > unsigned foo:4; /* bits 0..3 */
> > unsigned :2;
> > unsigned bar:12; /* bits 6..17 */
> > unsigned baz:14; /* bits 18..31 */
> > }
> > Even for host-endian it doesn't work all that well - you end up with
> > ifdefs in structure definition and generated code stinks. For fixed-endian
> > it gets really painful, and people tend to use explicit shift-and-mask
> > kind of macros for accessing the fields (and often enough get the
> > endianness conversions wrong, at that). With these primitives
> >
> > struct foo v <=> __le32 v
> > v.foo = i ? 1 : 2 <=> v = le32_replace_bits(v, i ? 1 : 2, 0, 4)
> > f(4 + v.baz) <=> f(4 + le32_get_bits(v, 18, 14))
>
> Looks very useful. The [start bit, size] pair may not land itself
> too nicely to creating defines, though. Which is why in
> include/linux/bitfield.h we tried to use a shifted mask and work
> backwards from that single value what the start and size are. commit
> 3e9b3112ec74 ("add basic register-field manipulation macros") has the
> description. Could a similar trick perhaps be applicable here?

Umm... What's wrong with

#define FIELD_FOO 0,4
#define FIELD_BAR 6,12
#define FIELD_BAZ 18,14

A macro can bloody well expand to any sequence of tokens - le32_get_bits(v, FIELD_BAZ)
will become le32_get_bits(v, 18, 14) just fine. What's the problem with that?

2017-12-12 19:45:36

by Al Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Tue, Dec 12, 2017 at 06:20:02AM +0000, Al Viro wrote:

> Umm... What's wrong with
>
> #define FIELD_FOO 0,4
> #define FIELD_BAR 6,12
> #define FIELD_BAZ 18,14
>
> A macro can bloody well expand to any sequence of tokens - le32_get_bits(v, FIELD_BAZ)
> will become le32_get_bits(v, 18, 14) just fine. What's the problem with that?

FWIW, if you want to use the mask, __builtin_ffsll() is not the only way to do
it - you don't need the shift. Multiplier would do just as well, and that can
be had easier. If mask = (2*a + 1)<<n = ((2*a)<<n) ^ (1<<n), then
mask - 1 = ((2*a) << n) + ((1<<n) - 1) = ((2*a) << n) ^ ((1<<n) - 1)
mask ^ (mask - 1) = (1<<n) + ((1<<n) - 1)
and
mask & (mask ^ (mask - 1)) = 1<<n.

IOW, with

static __always_inline u64 mask_to_multiplier(u64 mask)
{
return mask & (mask ^ (mask - 1));
}

we could do

static __always_inline __le64 le64_replace_bits(__le64 old, u64 v, u64 mask)
{
__le64 m = cpu_to_le64(mask);
return (old & ~m) | (cpu_to_le64(v * mask_to_multiplier(mask)) & m);
}

static __always_inline u64 le64_get_bits(__le64 v, u64 mask)
{
return (le64_to_cpu(v) & mask) / mask_to_multiplier(mask);
}

etc. Compiler will turn those into shifts... I can live with either calling
conventions.

Comments?

2017-12-12 20:04:18

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Tue, 12 Dec 2017 19:45:32 +0000, Al Viro wrote:
> On Tue, Dec 12, 2017 at 06:20:02AM +0000, Al Viro wrote:
>
> > Umm... What's wrong with
> >
> > #define FIELD_FOO 0,4
> > #define FIELD_BAR 6,12
> > #define FIELD_BAZ 18,14
> >
> > A macro can bloody well expand to any sequence of tokens - le32_get_bits(v, FIELD_BAZ)
> > will become le32_get_bits(v, 18, 14) just fine. What's the problem with that?
>
> FWIW, if you want to use the mask, __builtin_ffsll() is not the only way to do
> it - you don't need the shift. Multiplier would do just as well, and that can
> be had easier. If mask = (2*a + 1)<<n = ((2*a)<<n) ^ (1<<n), then
> mask - 1 = ((2*a) << n) + ((1<<n) - 1) = ((2*n) << n) ^ ((1<<n) - 1)
> mask ^ (mask - 1) = (1<<n) + ((1<<n) - 1)
> and
> mask & (mask ^ (mask - 1)) = 1<<n.
>
> IOW, with
>
> static __always_inline u64 mask_to_multiplier(u64 mask)
> {
> return mask & (mask ^ (mask - 1));
> }
>
> we could do
>
> static __always_inline __le64 le64_replace_bits(__le64 old, u64 v, u64 mask)
> {
> __le64 m = cpu_to_le64(mask);
> return (old & ~m) | (cpu_to_le64(v * mask_to_multiplier(mask)) & m);
> }
>
> static __always_inline u64 le64_get_bits(__le64 v, u64 mask)
> {
> return (le64_to_cpu(v) & mask) / mask_to_multiplier(mask);
> }
>
> etc. Compiler will turn those into shifts... I can live with either calling
> conventions.
>
> Comments?

Very nice! The compilation-time check if the value can fit in a field
covered by the mask (if they're both known) did help me catch bugs
early a few times over the years, so if it could be preserved we can
maybe even drop the FIELD_* macros and just use this approach?

2017-12-12 23:49:03

by Al Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Tue, Dec 12, 2017 at 12:04:09PM -0800, Jakub Kicinski wrote:

> > static __always_inline u64 mask_to_multiplier(u64 mask)
> > {
> > return mask & (mask ^ (mask - 1));
> > }

D'oh. Even simpler than that, of course -

static __always_inline u64 mask_to_multiplier(u64 mask)
{
return mask & -mask;
}

> Very nice! The compilation-time check if the value can fit in a field
> covered by the mask (if they're both known) did help me catch bugs
> early a few times over the years, so if it could be preserved we can
> maybe even drop the FIELD_* macros and just use this approach?

Umm... Something like this, perhaps? Same bunch, plus u{16,32,64}_...
variants for host-endian. Adding sanity check on mask is also not
hard, but I don't know how useful it actually is...

diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index 451aaa0786ae..a032de9aa03d 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -187,4 +187,36 @@ static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len)
dst[i] = be32_to_cpu(src[i]);
}

+extern void __compiletime_error("value doesn't fit into mask")
+__field_overflow(void);
+static __always_inline u64 mask_to_multiplier(u64 mask)
+{
+ return mask & -mask;
+}
+
+#define ____MAKE_OP(type,base,to,from) \
+static __always_inline __##type type##_replace_bits(__##type old, \
+ base val, base mask) \
+{ \
+ __##type m = to(mask); \
+ if (__builtin_constant_p(val) && \
+ (val & ~(mask/mask_to_multiplier(mask)))) \
+ __field_overflow(); \
+ return (old & ~m) | \
+ (to(val * mask_to_multiplier(mask)) & m); \
+} \
+static __always_inline base type##_get_bits(__##type v, base mask) \
+{ \
+ return (from(v) & mask)/mask_to_multiplier(mask); \
+}
+#define __MAKE_OP(size) \
+ ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \
+ ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \
+ ____MAKE_OP(u##size,u##size,,)
+__MAKE_OP(16)
+__MAKE_OP(32)
+__MAKE_OP(64)
+#undef __MAKE_OP
+#undef ____MAKE_OP
+
#endif /* _LINUX_BYTEORDER_GENERIC_H */

2017-12-13 00:00:05

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Tue, 12 Dec 2017 23:48:56 +0000, Al Viro wrote:
> On Tue, Dec 12, 2017 at 12:04:09PM -0800, Jakub Kicinski wrote:
>
> > > static __always_inline u64 mask_to_multiplier(u64 mask)
> > > {
> > > return mask & (mask ^ (mask - 1));
> > > }
>
> D'oh. Even simpler than that, of course -
>
> static __always_inline u64 mask_to_multiplier(u64 mask)
> {
> return mask & -mask;
> }
>
> > Very nice! The compilation-time check if the value can fit in a field
> > covered by the mask (if they're both known) did help me catch bugs
> > early a few times over the years, so if it could be preserved we can
> > maybe even drop the FIELD_* macros and just use this approach?
>
> Umm... Something like this, perhaps? Same bunch, plus u{16,32,64}_...
> variants for host-endian. Adding sanity check on mask is also not
> hard, but I don't know how useful it actually is...

Sanity checking the mask is not that useful in my experience.

> diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
> index 451aaa0786ae..a032de9aa03d 100644
> --- a/include/linux/byteorder/generic.h
> +++ b/include/linux/byteorder/generic.h
> @@ -187,4 +187,36 @@ static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len)
> dst[i] = be32_to_cpu(src[i]);
> }
>
> +extern void __compiletime_error("value doesn't fit into mask")
> +__field_overflow(void);
> +static __always_inline u64 mask_to_multiplier(u64 mask)
> +{
> + return mask & -mask;
> +}
> +
> +#define ____MAKE_OP(type,base,to,from) \
> +static __always_inline __##type type##_replace_bits(__##type old, \
> + base val, base mask) \
> +{ \
> + __##type m = to(mask); \
> + if (__builtin_constant_p(val) && \

Is the lack of a __builtin_constant_p(mask) test intentional? Sometimes
the bitfield is a packed array and people may have a helper to which
only the mask is passed as non-constant and the value is implied by the
helper, thus constant.

> + (val & ~(mask/mask_to_multiplier(mask)))) \
> + __field_overflow(); \
> + return (old & ~m) | \
> + (to(val * mask_to_multiplier(mask)) & m); \
> +} \
> +static __always_inline base type##_get_bits(__##type v, base mask) \
> +{ \
> + return (from(v) & mask)/mask_to_multiplier(mask); \
> +}
> +#define __MAKE_OP(size) \
> + ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \
> + ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \
> + ____MAKE_OP(u##size,u##size,,)
> +__MAKE_OP(16)
> +__MAKE_OP(32)
> +__MAKE_OP(64)
> +#undef __MAKE_OP
> +#undef ____MAKE_OP
> +
> #endif /* _LINUX_BYTEORDER_GENERIC_H */

2017-12-13 00:37:09

by Al Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Tue, Dec 12, 2017 at 03:59:33PM -0800, Jakub Kicinski wrote:
> > +static __always_inline __##type type##_replace_bits(__##type old, \
> > + base val, base mask) \
> > +{ \
> > + __##type m = to(mask); \
> > + if (__builtin_constant_p(val) && \
>
> Is the lack of a __builtin_constant_p(mask) test intentional? Sometimes
> the bitfield is a packed array and people may have a helper to which
> only the mask is passed as non-constant and the value is implied by the
> helper, thus constant.

If the mask is non-constant, we probably shouldn't be using that at all;
could you show a real-world example where that would be the case?

2017-12-13 01:04:48

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Wed, 13 Dec 2017 00:36:59 +0000, Al Viro wrote:
> On Tue, Dec 12, 2017 at 03:59:33PM -0800, Jakub Kicinski wrote:
> > > +static __always_inline __##type type##_replace_bits(__##type old, \
> > > + base val, base mask) \
> > > +{ \
> > > + __##type m = to(mask); \
> > > + if (__builtin_constant_p(val) && \
> >
> > Is the lack of a __builtin_constant_p(mask) test intentional? Sometimes
> > the bitfield is a packed array and people may have a helper to which
> > only the mask is passed as non-constant and the value is implied by the
> > helper, thus constant.
>
> If the mask in non-constant, we probably shouldn't be using that at all;
> could you show a real-world example where that would be the case?

FIELD_* macros explicitly forbid this, since the code would be...
suboptimal with the runtime ffsl. Real life examples are the hackish
macro NFP_ETH_SET_BIT_CONFIG() in

drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c

I remember there was also some Renesas code.. maybe this:

https://patchwork.kernel.org/patch/9881279/

it looks like cpg_z_clk_recalc_rate() and cpg_z2_clk_recalc_rate() only
differ in mask.

2017-12-13 01:30:59

by Al Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Tue, Dec 12, 2017 at 05:04:37PM -0800, Jakub Kicinski wrote:
> On Wed, 13 Dec 2017 00:36:59 +0000, Al Viro wrote:
> > On Tue, Dec 12, 2017 at 03:59:33PM -0800, Jakub Kicinski wrote:
> > > > +static __always_inline __##type type##_replace_bits(__##type old, \
> > > > + base val, base mask) \
> > > > +{ \
> > > > + __##type m = to(mask); \
> > > > + if (__builtin_constant_p(val) && \
> > >
> > > Is the lack of a __builtin_constant_p(mask) test intentional? Sometimes
> > > the bitfield is a packed array and people may have a helper to which
> > > only the mask is passed as non-constant and the value is implied by the
> > > helper, thus constant.
> >
> > If the mask in non-constant, we probably shouldn't be using that at all;
> > could you show a real-world example where that would be the case?
>
> FIELD_* macros explicitly forbid this, since the code would be...
> suboptimal with the runtime ffsl. Real life examples are the hackish
> macro NFP_ETH_SET_BIT_CONFIG() in
>
> drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c

Why not simply make nfp_eth_set_bit_config() static inline and be
done with that? It's not that large and there are only few callers,
so...

> I remember there was also some Renesas code.. maybe this:
>
> https://patchwork.kernel.org/patch/9881279/
>
> it looks like cpg_z_clk_recalc_rate() and cpg_z2_clk_recalc_rate() only
> differ in mask.

*shrug*

That thing would expand into "reg &= 15" in one case and "reg >>= 8; reg &= 15"
in another. Either of which is cheaper than a function call, and definitely
cheaper than any kind of dynamic calculation of shift, no matter how implemented.

2017-12-13 01:35:38

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Wed, 13 Dec 2017 01:30:56 +0000, Al Viro wrote:
> On Tue, Dec 12, 2017 at 05:04:37PM -0800, Jakub Kicinski wrote:
> > On Wed, 13 Dec 2017 00:36:59 +0000, Al Viro wrote:
> > > On Tue, Dec 12, 2017 at 03:59:33PM -0800, Jakub Kicinski wrote:
> > > > > +static __always_inline __##type type##_replace_bits(__##type old, \
> > > > > + base val, base mask) \
> > > > > +{ \
> > > > > + __##type m = to(mask); \
> > > > > + if (__builtin_constant_p(val) && \
> > > >
> > > > Is the lack of a __builtin_constant_p(mask) test intentional? Sometimes
> > > > the bitfield is a packed array and people may have a helper to which
> > > > only the mask is passed as non-constant and the value is implied by the
> > > > helper, thus constant.
> > >
> > > If the mask in non-constant, we probably shouldn't be using that at all;
> > > could you show a real-world example where that would be the case?
> >
> > FIELD_* macros explicitly forbid this, since the code would be...
> > suboptimal with the runtime ffsl. Real life examples are the hackish
> > macro NFP_ETH_SET_BIT_CONFIG() in
> >
> > drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
>
> Why not simply make nfp_eth_set_bit_config() static inline and be
> done with that? It's not that large and there are only few callers,
> so...

It used to be __always_inline, but apparently LLVM/clang doesn't
propagate constants :(

4e59532541c8 ("nfp: don't depend on compiler constant propagation")

> > I remember there was also some Renesas code.. maybe this:
> >
> > https://patchwork.kernel.org/patch/9881279/
> >
> > it looks like cpg_z_clk_recalc_rate() and cpg_z2_clk_recalc_rate() only
> > differ in mask.
>
> *shrug*
>
> That thing would expand into "reg &= 15" in one case and "reg >>= 8; reg &= 15"
> in another. Either of which is cheaper than a function call, and definitely
> cheaper than any kind of dynamic calculation of shift, no matter how implemented.

2017-12-13 01:51:31

by Al Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Tue, Dec 12, 2017 at 05:35:28PM -0800, Jakub Kicinski wrote:

> It used to be __always_inline, but apparently LLVM/clang doesn't
> propagate constants :(
>
> 4e59532541c8 ("nfp: don't depend on compiler constant propagation")

Doesn't propagate constants or doesn't have exact same set of
rules for __builtin_constant_p()? IOW, if you dropped that
BUILD_BUG_ON(), what would be left after optimizations?

2017-12-13 02:44:34

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Wed, 13 Dec 2017 01:51:25 +0000, Al Viro wrote:
> On Tue, Dec 12, 2017 at 05:35:28PM -0800, Jakub Kicinski wrote:
>
> > It used to be __always_inline, but apparently LLVM/clang doesn't
> > propagate constants :(
> >
> > 4e59532541c8 ("nfp: don't depend on compiler constant propagation")
>
> Doesn't propagate constants or doesn't have exact same set of
> rules for __builtin_constant_p()? IOW, if you dropped that
> BUILD_BUG_ON(), what would be left after optimizations?

Hm. You're right. It just doesn't recognize the parameter as constant
in __builtin_constant_p(). I haven't compiled the actual code, but
here is a trivial test:

18:36 ~$ clang --version
clang version 4.0.1 (tags/RELEASE_401/final)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/bin

18:36 ~$ cat /tmp/test.c
#include <stdio.h>

static inline int test_thing(unsigned int a)
{
printf("const: %d\n", __builtin_constant_p(a));
if (a > 10)
return a - 1;
return a;
}

int main()
{
printf("const: %d\n", __builtin_constant_p(0));
return test_thing(0);
}
18:36 ~$ clang -g /tmp/test.c -O2 -Wall -W -o /tmp/test
18:36 ~$ /tmp/test
const: 1
const: 0
18:36 ~$ objdump -S /tmp/test
...
00000000004004e0 <main>:
return a;
}

int main()
{
printf("const: %d\n", __builtin_constant_p(0));
4004e0: 50 push %rax
4004e1: bf a0 05 40 00 mov $0x4005a0,%edi
4004e6: be 01 00 00 00 mov $0x1,%esi
4004eb: 31 c0 xor %eax,%eax
4004ed: e8 fe fe ff ff callq 4003f0 <printf@plt>
printf("const: %d\n", __builtin_constant_p(a));
4004f2: bf a0 05 40 00 mov $0x4005a0,%edi
4004f7: 31 f6 xor %esi,%esi
4004f9: 31 c0 xor %eax,%eax
4004fb: e8 f0 fe ff ff callq 4003f0 <printf@plt>
return test_thing(0);
400500: 31 c0 xor %eax,%eax
400502: 59 pop %rcx
400503: c3 retq
400504: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
40050b: 00 00 00
40050e: 66 90 xchg %ax,%ax

...

2017-12-13 14:22:20

by Al Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Tue, Dec 12, 2017 at 06:44:00PM -0800, Jakub Kicinski wrote:
> On Wed, 13 Dec 2017 01:51:25 +0000, Al Viro wrote:
> > On Tue, Dec 12, 2017 at 05:35:28PM -0800, Jakub Kicinski wrote:
> >
> > > It used to be __always_inline, but apparently LLVM/clang doesn't
> > > propagate constants :(
> > >
> > > 4e59532541c8 ("nfp: don't depend on compiler constant propagation")
> >
> > Doesn't propagate constants or doesn't have exact same set of
> > rules for __builtin_constant_p()? IOW, if you dropped that
> > BUILD_BUG_ON(), what would be left after optimizations?
>
> Hm. You're right. It just doesn't recognize the parameter as constant
> in __builtin_constant_p().

FWIW, clang does propagate them well enough to detect and optimize
multiplication by constant power of two. With the variant I've posted.

The check on field overflow (which works on gcc builds) does nothing on
clang - __builtin_constant_p() gives a flat-out false for arguments of
inline function there. I can understand their reasoning, even though
it's inconvenient in cases like this one - semantics of __builtin_constant_p()
is a fucking mess and the less you rely upon its details, the safer you
are...

Hell knows - a part of that can be recovered by taking the check into a
wrapper; as in

static __always_inline __le32 __le32_replace_bits(__le32 old, u32 v, u32 mask)
{
__le32 m = cpu_to_le32(mask);
return (old & ~m) | (cpu_to_le32(v * mask_to_multiplier(mask)) & m);
}

#define le32_replace_bits(old, v, mask) ({ \
typeof(v) ____v = (v); \
typeof(mask) ____m = (mask); \
if (__builtin_constant_p(____v)) \
if (____v & ~(____m / mask_to_multiplier(____m))) \
__field_overflow(); \
__le32_replace_bits(old, ____v, ____m); \
})

That would give that sanity check a better coverage on clang builds, but...
does it really buy us enough to bother, especially since all those macros
would have to be spelled out; you can have a macro expanding to definition
of static inline, but you can't have a macro expanding to anything that
would contain a preprocessor directive, including #define. And with that
kind of sanity checks, the first build with gcc will catch everything
missed by clang builds anyway.

IMO it's not worth the trouble; let's go with the check inside of
static inline and accept that on clang builds it'll do nothing.

Next question: where do we put that bunch? I've put it into
linux/byteorder/generic.h, so that anything picking fixed-endian primitives
would pick those as well; I hadn't thought of linux/bitfield.h at the time.
We certainly could put it there instead - it's never pulled by other headers,
so adding #include <asm/byteorder.h> into linux/bitfield.h is not going to
cause header order problems. Not sure...

Linus, do you have any preferences in that area?

2017-12-13 17:46:05

by Al Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Wed, Dec 13, 2017 at 02:22:12PM +0000, Al Viro wrote:

> Next question: where do we put that bunch? I've put it into
> linux/byteorder/generic.h, so that anything picking fixed-endian primitives
> would pick those as well; I hadn't thought of linux/bitfield.h at the time.
> We certainly could put it there instead - it's never pulled by other headers,
> so adding #include <asm/byteorder.h> into linux/bitfield.h is not going to
> cause header order problems. Not sure...
>
> Linus, do you have any preferences in that area?

After looking at some of the callers of bitfield.h stuff: it might be useful
to add

static inline void le64p_replace_bits(__le64 *p, u64 v, u64 mask)
{
__le64 m = cpu_to_le64(mask);
*p = (*p & ~m) | (cpu_to_le64(v * mask_to_multiplier(mask)) & m);
}

and similar for other types. Not sure what would be a good name for
host-endian variants - u64p_replace_bits() sounds a bit clumsy. Suggestions?

2017-12-13 19:05:11

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [RFC][PATCH] new byteorder primitives - ..._{replace,get}_bits()

On Wed, 13 Dec 2017 14:22:12 +0000, Al Viro wrote:
> IMO it's not worth the trouble; let's go with the check inside of
> static inline and accept that on clang builds it'll do nothing.

Ack, thanks for investigating!

2017-12-15 02:33:49

by Al Viro

[permalink] [raw]
Subject: [RFC][PATCH] Add primitives for manipulating bitfields both in host- and fixed-endian.

The following primitives are defined in linux/bitfield.h:

* u32 le32_get_bits(__le32 val, u32 field) extracts the contents of the
bitfield specified by @field in little-endian 32bit value @val and
converts it to host-endian.

* void le32p_replace_bits(__le32 *p, u32 v, u32 field) replaces
the contents of the bitfield specified by @field in little-endian
32bit object pointed to by @p with the value of @v. New value is
given in host-endian and stored as little-endian.

* __le32 le32_replace_bits(__le32 old, u32 v, u32 field) is equivalent to
({__le32 tmp = old; le32p_replace_bits(&tmp, v, field); tmp;})
In other words, instead of modifying an object in memory, it takes
the initial value and returns the modified one.

Such set of helpers is defined for each of little-, big- and host-endian
types; e.g. u64_get_bits(val, field) will return the contents of the bitfield
specified by @field in host-endian 64bit value @val, etc. Of course, for
host-endian no conversion is involved.

Fields to access are specified as GENMASK() values - an N-bit field
starting at bit #M is encoded as GENMASK(M + N - 1, M). Note that
bit numbers refer to endianness of the object we are working with -
e.g. GENMASK(11, 0) in __be16 refers to the second byte and the lower
4 bits of the first byte. In __le16 it would refer to the first byte
and the lower 4 bits of the second byte, etc.

Field specification must be a constant; __builtin_constant_p() doesn't
have to be true for it, but compiler must be able to evaluate it at
build time. If it cannot or if the value does not encode any bitfield,
the build will fail.

If the value being stored in ..._replace_bits() is a constant that does
not fit into bitfield, a warning will be generated at compile time.

Signed-off-by: Al Viro <[email protected]>

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 1030651f8309..4b4f0531a79c 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -16,6 +16,7 @@
#define _LINUX_BITFIELD_H

#include <linux/build_bug.h>
+#include <asm/byteorder.h>

/*
* Bitfield access macros
@@ -103,4 +104,50 @@
(typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \
})

+extern void __compiletime_warning("value doesn't fit into mask")
+__field_overflow(void);
+extern void __compiletime_error("bad bitfield mask")
+__bad_mask(void);
+static __always_inline u64 mask_to_multiplier(u64 mask)
+{
+ if ((mask | (mask - 1)) & ((mask | (mask - 1)) + 1))
+ __bad_mask();
+ return mask & -mask;
+}
+
+#define ____MAKE_OP(type,base,to,from) \
+static __always_inline __##type type##_replace_bits(__##type old, \
+ base val, base field) \
+{ \
+ __##type m = to(field); \
+ if (__builtin_constant_p(val) && \
+ (val & ~(field/mask_to_multiplier(field)))) \
+ __field_overflow(); \
+ return (old & ~m) | \
+ (to(val * mask_to_multiplier(field)) & m); \
+} \
+static __always_inline void type##p_replace_bits(__##type *p, \
+ base val, base field) \
+{ \
+ __##type m = to(field); \
+ if (__builtin_constant_p(val) && \
+ (val & ~(field/mask_to_multiplier(field)))) \
+ __field_overflow(); \
+ *p = (*p & ~m) | \
+ (to(val * mask_to_multiplier(field)) & m); \
+} \
+static __always_inline base type##_get_bits(__##type v, base field) \
+{ \
+ return (from(v) & field)/mask_to_multiplier(field); \
+}
+#define __MAKE_OP(size) \
+ ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \
+ ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \
+ ____MAKE_OP(u##size,u##size,,)
+__MAKE_OP(16)
+__MAKE_OP(32)
+__MAKE_OP(64)
+#undef __MAKE_OP
+#undef ____MAKE_OP
+
#endif

2017-12-15 05:07:24

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [RFC][PATCH] Add primitives for manipulating bitfields both in host- and fixed-endian.

Looks great to me!

On Fri, 15 Dec 2017 02:33:43 +0000, Al Viro wrote:
> The following primitives are defined in linux/bitfield.h:
>
> * u32 le32_get_bits(__le32 val, u32 field) extracts the contents of the
> bitfield specified by @field in little-endian 32bit value @val and
> converts it to host-endian.
>
> * void le32p_replace_bits(__le32 *p, u32 v, u32 field) replaces
> the contents of the bitfield specified by @field in little-endian
> 32bit object pointet to by *p with the value of @v. New value is
> given in host-endian and stored as little-endian.
>
> * __le32 le32_replace_bits(__le32 old, u32 v, u32 field) is equivalent to
> ({__le32 tmp = old; le32p_replace_bits(&old, v, field); tmp;})
> In other words, instead of modifying an object in memory, it takes
> the initial value and returns the modified one.

the current macros take field/mask as first param, not sure if it's
worth maintaining the order

2017-12-15 05:34:08

by Al Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] Add primitives for manipulating bitfields both in host- and fixed-endian.

On Thu, Dec 14, 2017 at 09:07:13PM -0800, Jakub Kicinski wrote:
> Looks great to me!
>
> On Fri, 15 Dec 2017 02:33:43 +0000, Al Viro wrote:
> > The following primitives are defined in linux/bitfield.h:
> >
> > * u32 le32_get_bits(__le32 val, u32 field) extracts the contents of the
> > bitfield specified by @field in little-endian 32bit value @val and
> > converts it to host-endian.
> >
> > * void le32p_replace_bits(__le32 *p, u32 v, u32 field) replaces
> > the contents of the bitfield specified by @field in little-endian
> > 32bit object pointet to by *p with the value of @v. New value is
> > given in host-endian and stored as little-endian.
> >
> > * __le32 le32_replace_bits(__le32 old, u32 v, u32 field) is equivalent to
> > ({__le32 tmp = old; le32p_replace_bits(&old, v, field); tmp;})
> > In other words, instead of modifying an object in memory, it takes
> > the initial value and returns the modified one.
>
> the current macros take filed/mask as first param, not sure if it's
> worth maintaining the order

Umm... For something like Haskell that would be more natural (as in
replace_foo = replace_field foo), but it's C - no partially applied
functions here...

While we are at it, to cover the FIELD_PREP users it might make sense to
add

__le32 le32_encode_bits(u32 v, u32 field)
{
if (__builtin_constant_p(v) &&
(v & ~(field/mask_to_multiplier(field))))
__field_overflow();
return cpu_to_le32((v * mask_to_multiplier(field)) & field);
}

turning the body of le32_replace_bits into
return (old & ~cpu_to_le32(field)) | le32_encode_bits(v, field);

2017-12-15 16:48:18

by Al Viro

[permalink] [raw]
Subject: [RFC][PATCH v2] Add primitives for manipulating bitfields both in host- and fixed-endian.

[Folks, please review and comment; if no objections show up, into -next it goes]

The following primitives are defined in linux/bitfield.h:

* u32 le32_get_bits(__le32 val, u32 field) extracts the contents of the
bitfield specified by @field in little-endian 32bit object @val and
converts it to host-endian.

* void le32p_replace_bits(__le32 *p, u32 v, u32 field) replaces
the contents of the bitfield specified by @field in little-endian
32bit object pointed to by @p with the value of @v. New value is
given in host-endian and stored as little-endian.

* __le32 le32_replace_bits(__le32 old, u32 v, u32 field) is equivalent to
({__le32 tmp = old; le32p_replace_bits(&tmp, v, field); tmp;})
In other words, instead of modifying an object in memory, it takes
the initial value and returns the modified one.

* __le32 le32_encode_bits(u32 v, u32 field) is equivalent to
le32_replace_bits(0, v, field). In other words, it returns a little-endian
32bit object with the bitfield specified by @field containing the
value of @v and all bits outside that bitfield being zero.

Such set of helpers is defined for each of little-, big- and host-endian
types; e.g. u64_get_bits(val, field) will return the contents of the bitfield
specified by @field in host-endian 64bit object @val, etc. Of course, for
host-endian no conversion is involved.

Fields to access are specified as GENMASK() values - an N-bit field
starting at bit #M is encoded as GENMASK(M + N - 1, M). Note that
bit numbers refer to endianness of the object we are working with -
e.g. GENMASK(11, 0) in __be16 refers to the second byte and the lower
4 bits of the first byte. In __le16 it would refer to the first byte
and the lower 4 bits of the second byte, etc.

Field specification must be a constant; __builtin_constant_p() doesn't
have to be true for it, but compiler must be able to evaluate it at
build time. If it cannot or if the value does not encode any bitfield,
the build will fail.

If the value being stored in a bitfield is a constant that does not fit
into that bitfield, a warning will be generated at compile time.

Signed-off-by: Al Viro <[email protected]>

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 1030651f8309..cf2588d81148 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -16,6 +16,7 @@
#define _LINUX_BITFIELD_H

#include <linux/build_bug.h>
+#include <asm/byteorder.h>

/*
* Bitfield access macros
@@ -103,4 +104,49 @@
(typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \
})

+extern void __compiletime_warning("value doesn't fit into mask")
+__field_overflow(void);
+extern void __compiletime_error("bad bitfield mask")
+__bad_mask(void);
+static __always_inline u64 field_multiplier(u64 field)
+{
+ if ((field | (field - 1)) & ((field | (field - 1)) + 1))
+ __bad_mask();
+ return field & -field;
+}
+static __always_inline u64 field_mask(u64 field)
+{
+ return field / field_multiplier(field);
+}
+#define ____MAKE_OP(type,base,to,from) \
+static __always_inline __##type type##_encode_bits(base v, base field) \
+{ \
+ if (__builtin_constant_p(v) && (v & ~field_mask(field))) \
+ __field_overflow(); \
+ return to((v & field_mask(field)) * field_multiplier(field)); \
+} \
+static __always_inline __##type type##_replace_bits(__##type old, \
+ base val, base field) \
+{ \
+ return (old & ~to(field)) | type##_encode_bits(val, field); \
+} \
+static __always_inline void type##p_replace_bits(__##type *p, \
+ base val, base field) \
+{ \
+ *p = (*p & ~to(field)) | type##_encode_bits(val, field); \
+} \
+static __always_inline base type##_get_bits(__##type v, base field) \
+{ \
+ return (from(v) & field)/field_multiplier(field); \
+}
+#define __MAKE_OP(size) \
+ ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \
+ ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \
+ ____MAKE_OP(u##size,u##size,,)
+__MAKE_OP(16)
+__MAKE_OP(32)
+__MAKE_OP(64)
+#undef __MAKE_OP
+#undef ____MAKE_OP
+
#endif