2020-01-14 20:10:37

by Vineet Gupta

[permalink] [raw]
Subject: [RFC 4/4] ARC: uaccess: use optimized generic __strnlen_user/__strncpy_from_user

These rely on word access rather than byte loop

Signed-off-by: Vineet Gupta <[email protected]>
---
arch/arc/Kconfig | 2 +
arch/arc/include/asm/Kbuild | 1 -
arch/arc/include/asm/uaccess.h | 71 ++-------------------------
arch/arc/include/asm/word-at-a-time.h | 49 ++++++++++++++++++
4 files changed, 56 insertions(+), 67 deletions(-)
create mode 100644 arch/arc/include/asm/word-at-a-time.h

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 26108ea785c2..3b074c4d31fb 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -26,6 +26,8 @@ config ARC
select GENERIC_PENDING_IRQ if SMP
select GENERIC_SCHED_CLOCK
select GENERIC_SMP_IDLE_THREAD
+ select GENERIC_STRNCPY_FROM_USER if MMU
+ select GENERIC_STRNLEN_USER if MMU
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
select HAVE_DEBUG_STACKOVERFLOW
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 1b505694691e..cb8d459b7f56 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -24,5 +24,4 @@ generic-y += topology.h
generic-y += trace_clock.h
generic-y += user.h
generic-y += vga.h
-generic-y += word-at-a-time.h
generic-y += xor.h
diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h
index 0b34c152086f..f579e06447a9 100644
--- a/arch/arc/include/asm/uaccess.h
+++ b/arch/arc/include/asm/uaccess.h
@@ -23,7 +23,6 @@

#include <linux/string.h> /* for generic string functions */

-
#define __kernel_ok (uaccess_kernel())

/*
@@ -52,6 +51,8 @@
#define __access_ok(addr, sz) (unlikely(__kernel_ok) || \
likely(__user_ok((addr), (sz))))

+#define user_addr_max() (uaccess_kernel() ? ~0UL : get_fs())
+
/*********** Single byte/hword/word copies ******************/

#define __get_user_fn(sz, u, k) \
@@ -655,75 +656,13 @@ static inline unsigned long __clear_user(void __user *to, unsigned long n)
return res;
}

-static inline long
-__strncpy_from_user(char *dst, const char __user *src, long count)
-{
- long res = 0;
- char val;
-
- if (count == 0)
- return 0;
-
- __asm__ __volatile__(
- " mov lp_count, %5 \n"
- " lp 3f \n"
- "1: ldb.ab %3, [%2, 1] \n"
- " breq.d %3, 0, 3f \n"
- " stb.ab %3, [%1, 1] \n"
- " add %0, %0, 1 # Num of NON NULL bytes copied \n"
- "3: \n"
- " .section .fixup, \"ax\" \n"
- " .align 4 \n"
- "4: mov %0, %4 # sets @res as -EFAULT \n"
- " j 3b \n"
- " .previous \n"
- " .section __ex_table, \"a\" \n"
- " .align 4 \n"
- " .word 1b, 4b \n"
- " .previous \n"
- : "+r"(res), "+r"(dst), "+r"(src), "=r"(val)
- : "g"(-EFAULT), "r"(count)
- : "lp_count", "memory");
-
- return res;
-}
-
-static inline long __strnlen_user(const char __user *s, long n)
-{
- long res, tmp1, cnt;
- char val;
-
- __asm__ __volatile__(
- " mov %2, %1 \n"
- "1: ldb.ab %3, [%0, 1] \n"
- " breq.d %3, 0, 2f \n"
- " sub.f %2, %2, 1 \n"
- " bnz 1b \n"
- " sub %2, %2, 1 \n"
- "2: sub %0, %1, %2 \n"
- "3: ;nop \n"
- " .section .fixup, \"ax\" \n"
- " .align 4 \n"
- "4: mov %0, 0 \n"
- " j 3b \n"
- " .previous \n"
- " .section __ex_table, \"a\" \n"
- " .align 4 \n"
- " .word 1b, 4b \n"
- " .previous \n"
- : "=r"(res), "=r"(tmp1), "=r"(cnt), "=r"(val)
- : "0"(s), "1"(n)
- : "memory");
-
- return res;
-}
-
#define INLINE_COPY_TO_USER
#define INLINE_COPY_FROM_USER

#define __clear_user __clear_user
-#define __strncpy_from_user __strncpy_from_user
-#define __strnlen_user __strnlen_user
+
+extern long strncpy_from_user(char *dest, const char __user *src, long count);
+extern __must_check long strnlen_user(const char __user *str, long n);

#include <asm/segment.h>
#include <asm-generic/uaccess.h>
diff --git a/arch/arc/include/asm/word-at-a-time.h b/arch/arc/include/asm/word-at-a-time.h
new file mode 100644
index 000000000000..00e92be70987
--- /dev/null
+++ b/arch/arc/include/asm/word-at-a-time.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Synopsys Inc.
+ */
+#ifndef __ASM_ARC_WORD_AT_A_TIME_H
+#define __ASM_ARC_WORD_AT_A_TIME_H
+
+#ifdef __LITTLE_ENDIAN__
+
+#include <linux/kernel.h>
+
+struct word_at_a_time {
+ const unsigned long one_bits, high_bits;
+};
+
+#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
+
+static inline unsigned long has_zero(unsigned long a, unsigned long *bits,
+ const struct word_at_a_time *c)
+{
+ unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
+ *bits = mask;
+ return mask;
+}
+
+#define prep_zero_mask(a, bits, c) (bits)
+
+static inline unsigned long create_zero_mask(unsigned long bits)
+{
+ bits = (bits - 1) & ~bits;
+ return bits >> 7;
+}
+
+static inline unsigned long find_zero(unsigned long mask)
+{
+#ifdef CONFIG_64BIT
+ return fls64(mask) >> 3;
+#else
+ return fls(mask) >> 3;
+#endif
+}
+
+#define zero_bytemask(mask) (mask)
+
+#else /* __BIG_ENDIAN__ */
+#include <asm-generic/word-at-a-time.h>
+#endif
+
+#endif /* __ASM_WORD_AT_A_TIME_H */
--
2.20.1


2020-01-14 20:43:45

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [RFC 4/4] ARC: uaccess: use optimized generic __strnlen_user/__strncpy_from_user

On Tue, Jan 14, 2020 at 9:08 PM Vineet Gupta <[email protected]> wrote:

> diff --git a/arch/arc/include/asm/word-at-a-time.h b/arch/arc/include/asm/word-at-a-time.h
> new file mode 100644
> index 000000000000..00e92be70987
> --- /dev/null
> +++ b/arch/arc/include/asm/word-at-a-time.h
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2020 Synopsys Inc.
> + */
> +#ifndef __ASM_ARC_WORD_AT_A_TIME_H
> +#define __ASM_ARC_WORD_AT_A_TIME_H
> +
> +#ifdef __LITTLE_ENDIAN__
> +
> +#include <linux/kernel.h>
> +
> +struct word_at_a_time {
> + const unsigned long one_bits, high_bits;
> +};

What's wrong with the generic version on little-endian? Any
chance you can find a way to make it work as well for you as
this copy?

> +static inline unsigned long find_zero(unsigned long mask)
> +{
> +#ifdef CONFIG_64BIT
> + return fls64(mask) >> 3;
> +#else
> + return fls(mask) >> 3;
> +#endif

The CONFIG_64BIT check not be needed, unless you are adding
support for 64-bit ARC really soon.

Arnd

2020-01-14 21:38:20

by Vineet Gupta

[permalink] [raw]
Subject: Re: [RFC 4/4] ARC: uaccess: use optimized generic __strnlen_user/__strncpy_from_user

On 1/14/20 12:42 PM, Arnd Bergmann wrote:
> On Tue, Jan 14, 2020 at 9:08 PM Vineet Gupta <[email protected]> wrote:
>
>> diff --git a/arch/arc/include/asm/word-at-a-time.h b/arch/arc/include/asm/word-at-a-time.h
>> new file mode 100644
>> index 000000000000..00e92be70987
>> --- /dev/null
>> +++ b/arch/arc/include/asm/word-at-a-time.h
>> @@ -0,0 +1,49 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Copyright (C) 2020 Synopsys Inc.
>> + */
>> +#ifndef __ASM_ARC_WORD_AT_A_TIME_H
>> +#define __ASM_ARC_WORD_AT_A_TIME_H
>> +
>> +#ifdef __LITTLE_ENDIAN__
>> +
>> +#include <linux/kernel.h>
>> +
>> +struct word_at_a_time {
>> + const unsigned long one_bits, high_bits;
>> +};
>
> What's wrong with the generic version on little-endian? Any
> chance you can find a way to make it work as well for you as
> this copy?

find_zero() by default doesn't use pop count instructions. I didn't like the copy
either but wasn't sure of the best way to make this 4 API interface reusable. Are
you suggesting we allow partial over-ride starting with #ifndef find_zero ?

>> +static inline unsigned long find_zero(unsigned long mask)
>> +{
>> +#ifdef CONFIG_64BIT
>> + return fls64(mask) >> 3;
>> +#else
>> + return fls(mask) >> 3;
>> +#endif
>
> The CONFIG_64BIT check not be needed, unless you are adding
> support for 64-bit ARC really soon.

:-) Indeed that was the premise !

Thx for the quick review.

-Vineet

2020-01-14 21:50:49

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC 4/4] ARC: uaccess: use optimized generic __strnlen_user/__strncpy_from_user

On Tue, Jan 14, 2020 at 1:37 PM Vineet Gupta <[email protected]> wrote:
>
> On 1/14/20 12:42 PM, Arnd Bergmann wrote:
> >
> > What's wrong with the generic version on little-endian? Any
> > chance you can find a way to make it work as well for you as
> > this copy?
>
> find_zero() by default doesn't use pop count instructions.

Don't you think the generic find_zero() is likely just as fast as the
pop count instruction? On 32-bit, I think it's like a shift and a mask
and a couple of additions.

The 64-bit case has a multiply that is likely expensive unless you
have a good multiplication unit (but what 64-bit architecture
doesn't?), but the generic 32-bit LE code should already be pretty
close to optimal, and it might not be worth it to worry about it.

(The big-endian case is very different, and architectures really can
do much better. But LE allows for bit tricks using the carry chain)

Linus

2020-01-14 22:15:53

by Vineet Gupta

[permalink] [raw]
Subject: Re: [RFC 4/4] ARC: uaccess: use optimized generic __strnlen_user/__strncpy_from_user

On 1/14/20 1:49 PM, Linus Torvalds wrote:
> On Tue, Jan 14, 2020 at 1:37 PM Vineet Gupta <[email protected]> wrote:
>>
>> On 1/14/20 12:42 PM, Arnd Bergmann wrote:
>>>
>>> What's wrong with the generic version on little-endian? Any
>>> chance you can find a way to make it work as well for you as
>>> this copy?
>>
>> find_zero() by default doesn't use pop count instructions.
>
> Don't you think the generic find_zero() is likely just as fast as the
> pop count instruction? On 32-bit, I think it's like a shift and a mask
> and a couple of additions.

You are right that in grand scheme things it may be less than noise.

ARC pop count version

# bits = (bits - 1) & ~bits;
# return bits >> 7;

sub r0,r6,1
bic r6,r0,r6
lsr r0,r6,7

# return fls(mask) >> 3;

fls.f r0, r0
add.nz r0, r0, 1
asr r5,r0,3

j_s.d [blink]

Generic version

# bits = (bits - 1) & ~bits;
# return bits >> 7;

sub r5,r6,1
bic r6,r5,r6
lsr r5,r6,7

# unsigned long a = (0x0ff0001+mask) >> 23;
# return a & mask;

add r0,r5,0x0ff0001 <-- this is 8 byte instruction though
lsr_s r0,r0,23
and r5,r5,r0

j_s.d [blink]


But its the usual itch/inclination of arch people to try and use the specific
instruction if available.

>
> The 64-bit case has a multiply that is likely expensive unless you
> have a good multiplication unit (but what 64-bit architecture
> doesn't?), but the generic 32-bit LE code should already be pretty
> close to optimal, and it might not be worth it to worry about it.
>
> (The big-endian case is very different, and architectures really can
> do much better. But LE allows for bit tricks using the carry chain)

-Vineet