2013-05-16 11:11:24

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 00/10] uaccess: better might_sleep/might_fault behavior

This improves the might_fault annotations used
by uaccess routines:

1. The only reason uaccess routines might sleep
is if they fault. Make this explicit for
all architectures.
2. Accesses (e.g through socket ops) to kernel memory
with KERNEL_DS like net/sunrpc does will never sleep.
Remove an unconditional might_sleep in the inline
might_fault in kernel.h
(used when PROVE_LOCKING is not set).
3. Accesses with pagefault_disable return EFAULT
but won't cause caller to sleep.
Check for that and avoid might_sleep when
PROVE_LOCKING is set.

I'd like these changes to go in for the benefit of
the vhost driver where we want to call socket ops
under a spinlock, and fall back on slower thread handler
on error.

Please review, and consider for 3.11.


If the changes look good, what's the best way to merge them?
Maybe core/locking makes sense?

Note on arch code updates:
I tested x86_64 code.
Other architectures were build-tested.
I don't have cross-build environment for arm64, tile, microblaze and
mn10300 architectures. The changes look safe enough
but would appreciate review/acks from arch maintainers.

Version 1 of this change was titled
x86: uaccess s/might_sleep/might_fault/


Changes from v1:
add more architectures
fix might_fault() scheduling differently depending
on CONFIG_PROVE_LOCKING, as suggested by Ingo


Michael S. Tsirkin (10):
asm-generic: uaccess s/might_sleep/might_fault/
arm64: uaccess s/might_sleep/might_fault/
frv: uaccess s/might_sleep/might_fault/
m32r: uaccess s/might_sleep/might_fault/
microblaze: uaccess s/might_sleep/might_fault/
mn10300: uaccess s/might_sleep/might_fault/
powerpc: uaccess s/might_sleep/might_fault/
tile: uaccess s/might_sleep/might_fault/
x86: uaccess s/might_sleep/might_fault/
kernel: might_fault does not imply might_sleep

arch/arm64/include/asm/uaccess.h | 4 ++--
arch/frv/include/asm/uaccess.h | 4 ++--
arch/m32r/include/asm/uaccess.h | 12 ++++++------
arch/microblaze/include/asm/uaccess.h | 6 +++---
arch/mn10300/include/asm/uaccess.h | 4 ++--
arch/powerpc/include/asm/uaccess.h | 16 ++++++++--------
arch/tile/include/asm/uaccess.h | 2 +-
arch/x86/include/asm/uaccess_64.h | 2 +-
include/asm-generic/uaccess.h | 10 +++++-----
include/linux/kernel.h | 1 -
mm/memory.c | 14 +++++++++-----
11 files changed, 39 insertions(+), 36 deletions(-)

Thanks,

--
MST


2013-05-16 11:11:57

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 02/10] arm64: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/arm64/include/asm/uaccess.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 008f848..edb3d5c 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -166,7 +166,7 @@ do { \

#define get_user(x, ptr) \
({ \
- might_sleep(); \
+ might_fault(); \
access_ok(VERIFY_READ, (ptr), sizeof(*(ptr))) ? \
__get_user((x), (ptr)) : \
((x) = 0, -EFAULT); \
@@ -227,7 +227,7 @@ do { \

#define put_user(x, ptr) \
({ \
- might_sleep(); \
+ might_fault(); \
access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) ? \
__put_user((x), (ptr)) : \
-EFAULT; \
--
MST

2013-05-16 11:13:28

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 04/10] m32r: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/m32r/include/asm/uaccess.h | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h
index 1c7047b..84fe7ba 100644
--- a/arch/m32r/include/asm/uaccess.h
+++ b/arch/m32r/include/asm/uaccess.h
@@ -216,7 +216,7 @@ extern int fixup_exception(struct pt_regs *regs);
({ \
long __gu_err = 0; \
unsigned long __gu_val; \
- might_sleep(); \
+ might_fault(); \
__get_user_size(__gu_val,(ptr),(size),__gu_err); \
(x) = (__typeof__(*(ptr)))__gu_val; \
__gu_err; \
@@ -227,7 +227,7 @@ extern int fixup_exception(struct pt_regs *regs);
long __gu_err = -EFAULT; \
unsigned long __gu_val = 0; \
const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \
- might_sleep(); \
+ might_fault(); \
if (access_ok(VERIFY_READ,__gu_addr,size)) \
__get_user_size(__gu_val,__gu_addr,(size),__gu_err); \
(x) = (__typeof__(*(ptr)))__gu_val; \
@@ -295,7 +295,7 @@ do { \
#define __put_user_nocheck(x,ptr,size) \
({ \
long __pu_err; \
- might_sleep(); \
+ might_fault(); \
__put_user_size((x),(ptr),(size),__pu_err); \
__pu_err; \
})
@@ -305,7 +305,7 @@ do { \
({ \
long __pu_err = -EFAULT; \
__typeof__(*(ptr)) __user *__pu_addr = (ptr); \
- might_sleep(); \
+ might_fault(); \
if (access_ok(VERIFY_WRITE,__pu_addr,size)) \
__put_user_size((x),__pu_addr,(size),__pu_err); \
__pu_err; \
@@ -597,7 +597,7 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
*/
#define copy_to_user(to,from,n) \
({ \
- might_sleep(); \
+ might_fault(); \
__generic_copy_to_user((to),(from),(n)); \
})

@@ -638,7 +638,7 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
*/
#define copy_from_user(to,from,n) \
({ \
- might_sleep(); \
+ might_fault(); \
__generic_copy_from_user((to),(from),(n)); \
})

--
MST

2013-05-16 11:13:53

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 05/10] microblaze: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/microblaze/include/asm/uaccess.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index efe59d8..2fc8bf7 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -145,7 +145,7 @@ static inline unsigned long __must_check __clear_user(void __user *to,
static inline unsigned long __must_check clear_user(void __user *to,
unsigned long n)
{
- might_sleep();
+ might_fault();
if (unlikely(!access_ok(VERIFY_WRITE, to, n)))
return n;

@@ -371,7 +371,7 @@ extern long __user_bad(void);
static inline long copy_from_user(void *to,
const void __user *from, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_READ, from, n))
return __copy_from_user(to, from, n);
return n;
@@ -385,7 +385,7 @@ static inline long copy_from_user(void *to,
static inline long copy_to_user(void __user *to,
const void *from, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_WRITE, to, n))
return __copy_to_user(to, from, n);
return n;
--
MST

2013-05-16 11:16:22

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 03/10] frv: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/frv/include/asm/uaccess.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/frv/include/asm/uaccess.h b/arch/frv/include/asm/uaccess.h
index 0b67ec5..3ac9a59 100644
--- a/arch/frv/include/asm/uaccess.h
+++ b/arch/frv/include/asm/uaccess.h
@@ -280,14 +280,14 @@ extern long __memcpy_user(void *dst, const void *src, unsigned long count);
static inline unsigned long __must_check
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
- might_sleep();
+ might_fault();
return __copy_to_user_inatomic(to, from, n);
}

static inline unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
- might_sleep();
+ might_fault();
return __copy_from_user_inatomic(to, from, n);
}

--
MST

2013-05-16 11:18:20

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 07/10] powerpc: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/powerpc/include/asm/uaccess.h | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 4db4959..9485b43 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -178,7 +178,7 @@ do { \
long __pu_err; \
__typeof__(*(ptr)) __user *__pu_addr = (ptr); \
if (!is_kernel_addr((unsigned long)__pu_addr)) \
- might_sleep(); \
+ might_fault(); \
__chk_user_ptr(ptr); \
__put_user_size((x), __pu_addr, (size), __pu_err); \
__pu_err; \
@@ -188,7 +188,7 @@ do { \
({ \
long __pu_err = -EFAULT; \
__typeof__(*(ptr)) __user *__pu_addr = (ptr); \
- might_sleep(); \
+ might_fault(); \
if (access_ok(VERIFY_WRITE, __pu_addr, size)) \
__put_user_size((x), __pu_addr, (size), __pu_err); \
__pu_err; \
@@ -268,7 +268,7 @@ do { \
const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \
__chk_user_ptr(ptr); \
if (!is_kernel_addr((unsigned long)__gu_addr)) \
- might_sleep(); \
+ might_fault(); \
__get_user_size(__gu_val, __gu_addr, (size), __gu_err); \
(x) = (__typeof__(*(ptr)))__gu_val; \
__gu_err; \
@@ -282,7 +282,7 @@ do { \
const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \
__chk_user_ptr(ptr); \
if (!is_kernel_addr((unsigned long)__gu_addr)) \
- might_sleep(); \
+ might_fault(); \
__get_user_size(__gu_val, __gu_addr, (size), __gu_err); \
(x) = (__typeof__(*(ptr)))__gu_val; \
__gu_err; \
@@ -294,7 +294,7 @@ do { \
long __gu_err = -EFAULT; \
unsigned long __gu_val = 0; \
const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \
- might_sleep(); \
+ might_fault(); \
if (access_ok(VERIFY_READ, __gu_addr, (size))) \
__get_user_size(__gu_val, __gu_addr, (size), __gu_err); \
(x) = (__typeof__(*(ptr)))__gu_val; \
@@ -419,14 +419,14 @@ static inline unsigned long __copy_to_user_inatomic(void __user *to,
static inline unsigned long __copy_from_user(void *to,
const void __user *from, unsigned long size)
{
- might_sleep();
+ might_fault();
return __copy_from_user_inatomic(to, from, size);
}

static inline unsigned long __copy_to_user(void __user *to,
const void *from, unsigned long size)
{
- might_sleep();
+ might_fault();
return __copy_to_user_inatomic(to, from, size);
}

@@ -434,7 +434,7 @@ extern unsigned long __clear_user(void __user *addr, unsigned long size);

static inline unsigned long clear_user(void __user *addr, unsigned long size)
{
- might_sleep();
+ might_fault();
if (likely(access_ok(VERIFY_WRITE, addr, size)))
return __clear_user(addr, size);
if ((unsigned long)addr < TASK_SIZE) {
--
MST

2013-05-16 11:18:41

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 08/10] tile: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/tile/include/asm/uaccess.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index 8a082bc..e4d44bd 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -442,7 +442,7 @@ extern unsigned long __copy_in_user_inatomic(
static inline unsigned long __must_check
__copy_in_user(void __user *to, const void __user *from, unsigned long n)
{
- might_sleep();
+ might_fault();
return __copy_in_user_inatomic(to, from, n);
}

--
MST

2013-05-16 11:19:06

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 09/10] x86: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Acked-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/uaccess_64.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 142810c..4f7923d 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -235,7 +235,7 @@ extern long __copy_user_nocache(void *dst, const void __user *src,
static inline int
__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
{
- might_sleep();
+ might_fault();
return __copy_user_nocache(dst, src, size, 1);
}

--
MST

2013-05-16 11:19:28

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

There are several ways to make sure might_fault
calling function does not sleep.
One is to use it on kernel or otherwise locked memory - apparently
nfs/sunrpc does this. As noted by Ingo, this is handled by the
might_fault() implementation in mm/memory.c but not the one in
linux/kernel.h so in the current code might_fault() schedules
differently depending on CONFIG_PROVE_LOCKING, which is an undesired
semantic side effect.

Another is to call pagefault_disable: in this case the page fault
handler will go to fixups processing and we get an error instead of
sleeping, so the might_sleep annotation is a false positive.
vhost driver wants to do this now in order to reuse socket ops
under a spinlock (and fall back on slower thread handler
on error).

Address both issues by:
- dropping the unconditional call to might_sleep
from the fast might_fault code in linux/kernel.h
- checking for pagefault_disable() in the
CONFIG_PROVE_LOCKING implementation

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
include/linux/kernel.h | 1 -
mm/memory.c | 14 +++++++++-----
2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e96329c..322b065 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -198,7 +198,6 @@ void might_fault(void);
#else
static inline void might_fault(void)
{
- might_sleep();
}
#endif

diff --git a/mm/memory.c b/mm/memory.c
index 6dc1882..1b8327b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4222,13 +4222,17 @@ void might_fault(void)
if (segment_eq(get_fs(), KERNEL_DS))
return;

- might_sleep();
/*
- * it would be nicer only to annotate paths which are not under
- * pagefault_disable, however that requires a larger audit and
- * providing helpers like get_user_atomic.
+ * It would be nicer to annotate paths which are under preempt_disable
+ * but not under pagefault_disable, however that requires a new flag
+ * for differentiating between the two.
*/
- if (!in_atomic() && current->mm)
+ if (in_atomic())
+ return;
+
+ might_sleep();
+
+ if (current->mm)
might_lock_read(&current->mm->mmap_sem);
}
EXPORT_SYMBOL(might_fault);
--
MST

2013-05-16 11:18:05

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 06/10] mn10300: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/mn10300/include/asm/uaccess.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h
index 780560b..107508a 100644
--- a/arch/mn10300/include/asm/uaccess.h
+++ b/arch/mn10300/include/asm/uaccess.h
@@ -471,13 +471,13 @@ extern unsigned long __generic_copy_from_user(void *, const void __user *,

#define __copy_to_user(to, from, n) \
({ \
- might_sleep(); \
+ might_fault(); \
__copy_to_user_inatomic((to), (from), (n)); \
})

#define __copy_from_user(to, from, n) \
({ \
- might_sleep(); \
+ might_fault(); \
__copy_from_user_inatomic((to), (from), (n)); \
})

--
MST

2013-05-16 11:21:15

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v2 01/10] asm-generic: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
include/asm-generic/uaccess.h | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h
index c184aa8..dc1269c 100644
--- a/include/asm-generic/uaccess.h
+++ b/include/asm-generic/uaccess.h
@@ -163,7 +163,7 @@ static inline __must_check long __copy_to_user(void __user *to,

#define put_user(x, ptr) \
({ \
- might_sleep(); \
+ might_fault(); \
access_ok(VERIFY_WRITE, ptr, sizeof(*ptr)) ? \
__put_user(x, ptr) : \
-EFAULT; \
@@ -225,7 +225,7 @@ extern int __put_user_bad(void) __attribute__((noreturn));

#define get_user(x, ptr) \
({ \
- might_sleep(); \
+ might_fault(); \
access_ok(VERIFY_READ, ptr, sizeof(*ptr)) ? \
__get_user(x, ptr) : \
-EFAULT; \
@@ -255,7 +255,7 @@ extern int __get_user_bad(void) __attribute__((noreturn));
static inline long copy_from_user(void *to,
const void __user * from, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_READ, from, n))
return __copy_from_user(to, from, n);
else
@@ -265,7 +265,7 @@ static inline long copy_from_user(void *to,
static inline long copy_to_user(void __user *to,
const void *from, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_WRITE, to, n))
return __copy_to_user(to, from, n);
else
@@ -336,7 +336,7 @@ __clear_user(void __user *to, unsigned long n)
static inline __must_check unsigned long
clear_user(void __user *to, unsigned long n)
{
- might_sleep();
+ might_fault();
if (!access_ok(VERIFY_WRITE, to, n))
return n;

--
MST

2013-05-16 13:29:27

by Catalin Marinas

[permalink] [raw]
Subject: Re: [PATCH v2 02/10] arm64: uaccess s/might_sleep/might_fault/

On 16 May 2013 12:10, Michael S. Tsirkin <[email protected]> wrote:
> The only reason uaccess routines might sleep
> is if they fault. Make this explicit.
>
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> ---
> arch/arm64/include/asm/uaccess.h | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)

For arm64:

Acked-by: Catalin Marinas <[email protected]>

2013-05-16 13:33:27

by Chris Metcalf

[permalink] [raw]
Subject: Re: [PATCH v2 08/10] tile: uaccess s/might_sleep/might_fault/

On 5/16/2013 7:15 AM, Michael S. Tsirkin wrote:
> The only reason uaccess routines might sleep
> is if they fault. Make this explicit.
>
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> ---
> arch/tile/include/asm/uaccess.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>

Acked-by: Chris Metcalf <[email protected]>

--
Chris Metcalf, Tilera Corp.
http://www.tilera.com

2013-05-16 18:45:08

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Thu, May 16, 2013 at 02:16:10PM +0300, Michael S. Tsirkin wrote:
> There are several ways to make sure might_fault
> calling function does not sleep.
> One is to use it on kernel or otherwise locked memory - apparently
> nfs/sunrpc does this. As noted by Ingo, this is handled by the
> migh_fault() implementation in mm/memory.c but not the one in
> linux/kernel.h so in the current code might_fault() schedules
> differently depending on CONFIG_PROVE_LOCKING, which is an undesired
> semantical side effect.
>
> Another is to call pagefault_disable: in this case the page fault
> handler will go to fixups processing and we get an error instead of
> sleeping, so the might_sleep annotation is a false positive.
> vhost driver wants to do this now in order to reuse socket ops
> under a spinlock (and fall back on slower thread handler
> on error).

Are you using the assumption that spin_lock() implies preempt_disable() implies
pagefault_disable()? Note that this assumption isn't valid for -rt where the
spinlock becomes preemptible but we'll not disable pagefaults.

> Address both issues by:
> - dropping the unconditional call to might_sleep
> from the fast might_fault code in linux/kernel.h
> - checking for pagefault_disable() in the
> CONFIG_PROVE_LOCKING implementation
>
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> ---
> include/linux/kernel.h | 1 -
> mm/memory.c | 14 +++++++++-----
> 2 files changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e96329c..322b065 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -198,7 +198,6 @@ void might_fault(void);
> #else
> static inline void might_fault(void)
> {
> - might_sleep();

This removes potential resched points for PREEMPT_VOLUNTARY -- was that
intentional?

> }
> #endif
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 6dc1882..1b8327b 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4222,13 +4222,17 @@ void might_fault(void)
> if (segment_eq(get_fs(), KERNEL_DS))
> return;
>
> - might_sleep();
> /*
> - * it would be nicer only to annotate paths which are not under
> - * pagefault_disable, however that requires a larger audit and
> - * providing helpers like get_user_atomic.
> + * It would be nicer to annotate paths which are under preempt_disable
> + * but not under pagefault_disable, however that requires a new flag
> + * for differentiating between the two.

-rt has this, pagefault_disable() doesn't change the preempt count but pokes
at task_struct::pagefault_disable.

> */
> - if (!in_atomic() && current->mm)
> + if (in_atomic())
> + return;
> +
> + might_sleep();
> +
> + if (current->mm)
> might_lock_read(&current->mm->mmap_sem);
> }
> EXPORT_SYMBOL(might_fault);
> --
> MST

2013-05-19 09:37:17

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Thu, May 16, 2013 at 08:40:41PM +0200, Peter Zijlstra wrote:
> On Thu, May 16, 2013 at 02:16:10PM +0300, Michael S. Tsirkin wrote:
> > There are several ways to make sure might_fault
> > calling function does not sleep.
> > One is to use it on kernel or otherwise locked memory - apparently
> > nfs/sunrpc does this. As noted by Ingo, this is handled by the
> > migh_fault() implementation in mm/memory.c but not the one in
> > linux/kernel.h so in the current code might_fault() schedules
> > differently depending on CONFIG_PROVE_LOCKING, which is an undesired
> > semantical side effect.
> >
> > Another is to call pagefault_disable: in this case the page fault
> > handler will go to fixups processing and we get an error instead of
> > sleeping, so the might_sleep annotation is a false positive.
> > vhost driver wants to do this now in order to reuse socket ops
> > under a spinlock (and fall back on slower thread handler
> > on error).
>
> Are you using the assumption that spin_lock() implies preempt_disable() implies
> pagefault_disable()? Note that this assumption isn't valid for -rt where the
> spinlock becomes preemptible but we'll not disable pagefaults.

No, I was not assuming that. What I'm trying to say is that a caller
that does something like this under a spinlock:
preempt_disable
pagefault_disable
error = copy_to_user
pagefault_enable
preempt_enable_no_resched

is not doing anything wrong and should not get a warning,
as long as error is handled correctly later.
Right?

> > Address both issues by:
> > - dropping the unconditional call to might_sleep
> > from the fast might_fault code in linux/kernel.h
> > - checking for pagefault_disable() in the
> > CONFIG_PROVE_LOCKING implementation
> >
> > Signed-off-by: Michael S. Tsirkin <[email protected]>
> > ---
> > include/linux/kernel.h | 1 -
> > mm/memory.c | 14 +++++++++-----
> > 2 files changed, 9 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> > index e96329c..322b065 100644
> > --- a/include/linux/kernel.h
> > +++ b/include/linux/kernel.h
> > @@ -198,7 +198,6 @@ void might_fault(void);
> > #else
> > static inline void might_fault(void)
> > {
> > - might_sleep();
>
> This removes potential resched points for PREEMPT_VOLUNTARY -- was that
> intentional?

No it's a bug. Thanks for pointing this out.
OK so I guess it should be might_sleep_if(!in_atomic())
and this means might_fault would have to move from linux/kernel.h to
linux/uaccess.h, since in_atomic() is in linux/hardirq.h

Makes sense?

> > }
> > #endif
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 6dc1882..1b8327b 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4222,13 +4222,17 @@ void might_fault(void)
> > if (segment_eq(get_fs(), KERNEL_DS))
> > return;
> >
> > - might_sleep();
> > /*
> > - * it would be nicer only to annotate paths which are not under
> > - * pagefault_disable, however that requires a larger audit and
> > - * providing helpers like get_user_atomic.
> > + * It would be nicer to annotate paths which are under preempt_disable
> > + * but not under pagefault_disable, however that requires a new flag
> > + * for differentiating between the two.
>
> -rt has this, pagefault_disable() doesn't change the preempt count but pokes
> at task_struct::pagefault_disable.

Good to know.

So maybe we can import this at least for CONFIG_PROVE_LOCKING?
To make the patch smaller I'd prefer doing both for now,
this way this patchset does not have to poke in too many
mm internals.
I can try doing that - unless
someone else has plans to merge this part soon anyway?

> > */
> > - if (!in_atomic() && current->mm)
> > + if (in_atomic())
> > + return;
> > +
> > + might_sleep();
> > +
> > + if (current->mm)
> > might_lock_read(&current->mm->mmap_sem);
> > }
> > EXPORT_SYMBOL(might_fault);
> > --
> > MST

2013-05-19 12:34:13

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Sun, 2013-05-19 at 12:35 +0300, Michael S. Tsirkin wrote:

> No, I was not assuming that. What I'm trying to say is that a caller
> that does something like this under a spinlock:
> preempt_disable
> pagefault_disable
> error = copy_to_user
> pagefault_enable
> preempt_enable_no_resched
>
> is not doing anything wrong and should not get a warning,
> as long as error is handled correctly later.
> Right?
>

What I see wrong with the above is the preempt_enable_no_resched(). The
only place that should be ever used is right before a schedule(), as you
don't want to schedule twice (once for the preempt_enable() and then
again for the schedule itself).

Remember, in -rt, a spin lock does not disable preemption.

-- Steve

2013-05-19 13:35:48

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Sun, May 19, 2013 at 08:34:04AM -0400, Steven Rostedt wrote:
> On Sun, 2013-05-19 at 12:35 +0300, Michael S. Tsirkin wrote:
>
> > No, I was not assuming that. What I'm trying to say is that a caller
> > that does something like this under a spinlock:
> > preempt_disable
> > pagefault_disable
> > error = copy_to_user
> > pagefault_enable
> > preempt_enable_no_resched
> >
> > is not doing anything wrong and should not get a warning,
> > as long as error is handled correctly later.
> > Right?
> >
>
> What I see wrong with the above is the preempt_enable_no_resched(). The
> only place that should be ever used is right before a schedule(), as you
> don't want to schedule twice (once for the preempt_enable() and then
> again for the schedule itself).
>
> Remember, in -rt, a spin lock does not disable preemption.
>
> -- Steve

Right but we need to keep it working on upstream as well.
If I do preempt_enable under a spinlock upstream won't it
try to sleep under spinlock?


--
MST

2013-05-19 16:06:26

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Sun, 2013-05-19 at 16:34 +0300, Michael S. Tsirkin wrote:

> Right but we need to keep it working on upstream as well.
> If I do preempt_enable under a spinlock upstream won't it
> try to sleep under spinlock?

No it wont. A spinlock calls preempt_disable implicitly, and a
preempt_enable() will not schedule unless preempt_count is zero, which
it wont be under a spinlock.

If it did, there would be lots of bugs all over the place because this
is done throughout the kernel (a preempt_enable() under a spinlock).

In other words, don't ever use preempt_enable_no_resched().

-- Steve

2013-05-19 16:41:49

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Sun, May 19, 2013 at 12:06:19PM -0400, Steven Rostedt wrote:
> On Sun, 2013-05-19 at 16:34 +0300, Michael S. Tsirkin wrote:
>
> > Right but we need to keep it working on upstream as well.
> > If I do preempt_enable under a spinlock upstream won't it
> > try to sleep under spinlock?
>
> No it wont. A spinlock calls preempt_disable implicitly, and a
> preempt_enable() will not schedule unless preempt_count is zero, which
> it wont be under a spinlock.
>
> If it did, there would be lots of bugs all over the place because this
> is done throughout the kernel (a preempt_enable() under a spinlock).
>
> In other words, don't ever use preempt_enable_no_resched().
>
> -- Steve
>


OK I get it. So let me correct myself. The simple code
that does something like this under a spinlock:
> preempt_disable
> pagefault_disable
> error = copy_to_user
> pagefault_enable
> preempt_enable
>
is not doing anything wrong and should not get a warning,
as long as error is handled correctly later.
Right?

2013-05-19 20:23:37

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Sun, 2013-05-19 at 19:40 +0300, Michael S. Tsirkin wrote:

> OK I get it. So let me correct myself. The simple code
> that does something like this under a spinlock:
> > preempt_disable
> > pagefault_disable
> > error = copy_to_user
> > pagefault_enable
> > preempt_enable
> >
> is not doing anything wrong and should not get a warning,
> as long as error is handled correctly later.
> Right?

I came in mid thread and I don't know the context. Anyway, the above
looks to me as you just don't want to sleep. If you try to copy data to
user space that happens not to be currently mapped for any reason, you
will get an error. Even if the address space is completely valid. Is
that what you want?

-- Steve

2013-05-19 20:39:55

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Sun, May 19, 2013 at 04:23:22PM -0400, Steven Rostedt wrote:
> On Sun, 2013-05-19 at 19:40 +0300, Michael S. Tsirkin wrote:
>
> > OK I get it. So let me correct myself. The simple code
> > that does something like this under a spinlock:
> > > preempt_disable
> > > pagefault_disable
> > > error = copy_to_user
> > > pagefault_enable
> > > preempt_enable
> > >
> > is not doing anything wrong and should not get a warning,
> > as long as error is handled correctly later.
> > Right?
>
> I came in mid thread and I don't know the context.

The context is that I want to change might_fault
from might_sleep to
might_sleep_if(!in_atomic())
so that above does not trigger warnings even with
CONFIG_DEBUG_ATOMIC_SLEEP enabled.


> Anyway, the above
> looks to me as you just don't want to sleep.

Exactly. upstream we can just do pagefault_disable but to make
this code -rt ready it's best to do preempt_disable as well.

> If you try to copy data to
> user space that happens not to be currently mapped for any reason, you
> will get an error. Even if the address space is completely valid. Is
> that what you want?
>
> -- Steve
>

Yes, this is by design.
We detect that and bounce the work to a thread outside
any locks.

Thanks,

--
MST

2013-05-21 12:34:31

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Sun, May 19, 2013 at 07:40:09PM +0300, Michael S. Tsirkin wrote:
> OK I get it. So let me correct myself. The simple code
> that does something like this under a spinlock:
> > preempt_disable
> > pagefault_disable
> > error = copy_to_user
> > pagefault_enable
> > preempt_enable
> >
> is not doing anything wrong and should not get a warning,
> as long as error is handled correctly later.
> Right?

Indeed, but I don't get the point of the preempt_{disable,enable}()
here. Why does it have to disable preemption explicitly here? I thought
all you wanted was to avoid the pagefault handler and make it do the
exception table thing; for that pagefault_disable() is sufficient.

2013-05-21 12:34:43

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Sun, May 19, 2013 at 12:35:26PM +0300, Michael S. Tsirkin wrote:
> > > --- a/include/linux/kernel.h
> > > +++ b/include/linux/kernel.h
> > > @@ -198,7 +198,6 @@ void might_fault(void);
> > > #else
> > > static inline void might_fault(void)
> > > {
> > > - might_sleep();
> >
> > This removes potential resched points for PREEMPT_VOLUNTARY -- was that
> > intentional?
>
> No it's a bug. Thanks for pointing this out.
> OK so I guess it should be might_sleep_if(!in_atomic())
> and this means might_fault would have to move from linux/kernel.h to
> linux/uaccess.h, since in_atomic() is in linux/hardirq.h
>
> Makes sense?

So the only difference between PROVE_LOCKING and not should be the
might_lock_read() thing; so how about something like this?

---
include/linux/kernel.h | 7 ++-----
include/linux/uaccess.h | 26 ++++++++++++++++++++++++++
mm/memory.c | 14 ++------------
3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e96329c..70812f4 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -194,12 +194,9 @@ extern int _cond_resched(void);
})

#ifdef CONFIG_PROVE_LOCKING
-void might_fault(void);
+void might_fault_lockdep(void);
#else
-static inline void might_fault(void)
-{
- might_sleep();
-}
+static inline void might_fault_lockdep(void) { }
#endif

extern struct atomic_notifier_head panic_notifier_list;
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 5ca0951..50a2cc9 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -38,6 +38,32 @@ static inline void pagefault_enable(void)
preempt_check_resched();
}

+static inline bool __can_fault(void)
+{
+ /*
+ * Some code (nfs/sunrpc) uses socket ops on kernel memory while
+ * holding the mmap_sem, this is safe because kernel memory doesn't
+ * get paged out, therefore we'll never actually fault, and the
+ * below annotations will generate false positives.
+ */
+ if (segment_eq(get_fs(), KERNEL_DS))
+ return false;
+
+ if (in_atomic() /* || pagefault_disabled() */)
+ return false;
+
+ return true;
+}
+
+static inline void might_fault(void)
+{
+ if (!__can_fault())
+ return;
+
+ might_sleep();
+ might_fault_lockdep();
+}
+
#ifndef ARCH_HAS_NOCACHE_UACCESS

static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
diff --git a/mm/memory.c b/mm/memory.c
index 6dc1882..266610c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4211,19 +4211,9 @@ void print_vma_addr(char *prefix, unsigned long ip)
}

#ifdef CONFIG_PROVE_LOCKING
-void might_fault(void)
+void might_fault_lockdep(void)
{
/*
- * Some code (nfs/sunrpc) uses socket ops on kernel memory while
- * holding the mmap_sem, this is safe because kernel memory doesn't
- * get paged out, therefore we'll never actually fault, and the
- * below annotations will generate false positives.
- */
- if (segment_eq(get_fs(), KERNEL_DS))
- return;
-
- might_sleep();
- /*
* it would be nicer only to annotate paths which are not under
* pagefault_disable, however that requires a larger audit and
* providing helpers like get_user_atomic.
@@ -4231,7 +4221,7 @@ void might_fault(void)
if (!in_atomic() && current->mm)
might_lock_read(&current->mm->mmap_sem);
}
-EXPORT_SYMBOL(might_fault);
+EXPORT_SYMBOL(might_fault_lockdep);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)

2013-05-21 12:39:16

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Sun, May 19, 2013 at 12:35:26PM +0300, Michael S. Tsirkin wrote:
> On Thu, May 16, 2013 at 08:40:41PM +0200, Peter Zijlstra wrote:
> > On Thu, May 16, 2013 at 02:16:10PM +0300, Michael S. Tsirkin wrote:
> > > There are several ways to make sure might_fault
> > > calling function does not sleep.
> > > One is to use it on kernel or otherwise locked memory - apparently
> > > nfs/sunrpc does this. As noted by Ingo, this is handled by the
> > > might_fault() implementation in mm/memory.c but not the one in
> > > linux/kernel.h so in the current code might_fault() schedules
> > > differently depending on CONFIG_PROVE_LOCKING, which is an undesired
> > > semantical side effect.
> > >
> > > Another is to call pagefault_disable: in this case the page fault
> > > handler will go to fixups processing and we get an error instead of
> > > sleeping, so the might_sleep annotation is a false positive.
> > > vhost driver wants to do this now in order to reuse socket ops
> > > under a spinlock (and fall back on slower thread handler
> > > on error).
> >
> > Are you using the assumption that spin_lock() implies preempt_disable() implies
> > pagefault_disable()? Note that this assumption isn't valid for -rt where the
> > spinlock becomes preemptible but we'll not disable pagefaults.
>
> No, I was not assuming that. What I'm trying to say is that a caller
> that does something like this under a spinlock:
> preempt_disable
> pagefault_disable
> error = copy_to_user
> pagefault_enable
> preempt_enable_no_resched
>
> is not doing anything wrong and should not get a warning,
> as long as error is handled correctly later.
> Right?

Aside from the no_resched() thing which Steven already explained and my
previous email asking why you need the preempt_disable() at all, that
should indeed work.

The reason I was asking was that I wasn't sure you weren't doing:

spin_lock(&my_lock);
error = copy_to_user();
spin_unlock(&my_lock);

and expecting the copy_to_user() to always take the exception table
route. This works on mainline (since spin_lock implies a preempt disable
and preempt_disable is the same as pagefault_disable). However as should
be clear by now, it doesn't quite work that way for -rt.

2013-05-22 09:26:52

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH v2 00/10] uaccess: better might_sleep/might_fault behavior

On Thursday 16 May 2013, Michael S. Tsirkin wrote:
> This improves the might_fault annotations used
> by uaccess routines:
>
> 1. The only reason uaccess routines might sleep
> is if they fault. Make this explicit for
> all architectures.
> 2. Accesses (e.g through socket ops) to kernel memory
> with KERNEL_DS like net/sunrpc does will never sleep.
> Remove an unconditional might_sleep in the inline
> might_fault in kernel.h
> (used when PROVE_LOCKING is not set).
> 3. Accesses with pagefault_disable return EFAULT
> but won't cause caller to sleep.
> Check for that and avoid might_sleep when
> PROVE_LOCKING is set.
>
> I'd like these changes to go in for the benefit of
> the vhost driver where we want to call socket ops
> under a spinlock, and fall back on slower thread handler
> on error.

Hi Michael,

I have recently stumbled over a related topic, which is the highly
inconsistent placement of might_fault() or might_sleep() in certain
classes of uaccess functions. Your patches seem completely reasonable,
but it would be good to also fix the other problem, at least on
the architectures we most care about.

Given the most commonly used functions and a couple of architectures
I'm familiar with, these are the ones that currently call might_fault()

x86-32 x86-64 arm arm64 powerpc s390 generic
copy_to_user - x - - - x x
copy_from_user - x - - - x x
put_user x x x x x x x
get_user x x x x x x x
__copy_to_user x x - - x - -
__copy_from_user x x - - x - -
__put_user - - x - x - -
__get_user - - x - x - -

WTF?

Calling might_fault() for every __get_user/__put_user is rather expensive
because it turns what should be a single instruction (plus fixup) into an
external function call.

My feeling is that we should do might_fault() only in access_ok() to get
the right balance.

Arnd

2013-05-22 09:48:38

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Tue, May 21, 2013 at 01:57:34PM +0200, Peter Zijlstra wrote:
> On Sun, May 19, 2013 at 12:35:26PM +0300, Michael S. Tsirkin wrote:
> > > > --- a/include/linux/kernel.h
> > > > +++ b/include/linux/kernel.h
> > > > @@ -198,7 +198,6 @@ void might_fault(void);
> > > > #else
> > > > static inline void might_fault(void)
> > > > {
> > > > - might_sleep();
> > >
> > > This removes potential resched points for PREEMPT_VOLUNTARY -- was that
> > > intentional?
> >
> > No it's a bug. Thanks for pointing this out.
> > OK so I guess it should be might_sleep_if(!in_atomic())
> > and this means might_fault would have to move from linux/kernel.h to
> > linux/uaccess.h, since in_atomic() is in linux/hardirq.h
> >
> > Makes sense?
>
> So the only difference between PROVE_LOCKING and not should be the
> might_lock_read() thing; so how about something like this?
>
> ---
> include/linux/kernel.h | 7 ++-----
> include/linux/uaccess.h | 26 ++++++++++++++++++++++++++
> mm/memory.c | 14 ++------------
> 3 files changed, 30 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e96329c..70812f4 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -194,12 +194,9 @@ extern int _cond_resched(void);
> })
>
> #ifdef CONFIG_PROVE_LOCKING
> -void might_fault(void);
> +void might_fault_lockdep(void);
> #else
> -static inline void might_fault(void)
> -{
> - might_sleep();
> -}
> +static inline void might_fault_lockdep(void) { }
> #endif
>
> extern struct atomic_notifier_head panic_notifier_list;
> diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
> index 5ca0951..50a2cc9 100644
> --- a/include/linux/uaccess.h
> +++ b/include/linux/uaccess.h
> @@ -38,6 +38,32 @@ static inline void pagefault_enable(void)
> preempt_check_resched();
> }
>
> +static inline bool __can_fault(void)
> +{
> + /*
> + * Some code (nfs/sunrpc) uses socket ops on kernel memory while
> + * holding the mmap_sem, this is safe because kernel memory doesn't
> + * get paged out, therefore we'll never actually fault, and the
> + * below annotations will generate false positives.
> + */
> + if (segment_eq(get_fs(), KERNEL_DS))
> + return false;
> +
> + if (in_atomic() /* || pagefault_disabled() */)

One question here: I'm guessing you put this comment here
for illustrative purposes, implying code that will
be enabled in -rt?
We don't want it upstream I think, right?


> + return false;
> +
> + return true;
> +}
> +
> +static inline void might_fault(void)
> +{
> + if (!__can_fault())
> + return;
> +
> + might_sleep();
> + might_fault_lockdep();
> +}
> +
> #ifndef ARCH_HAS_NOCACHE_UACCESS
>
> static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
> diff --git a/mm/memory.c b/mm/memory.c
> index 6dc1882..266610c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4211,19 +4211,9 @@ void print_vma_addr(char *prefix, unsigned long ip)
> }
>
> #ifdef CONFIG_PROVE_LOCKING
> -void might_fault(void)
> +void might_fault_lockdep(void)
> {
> /*
> - * Some code (nfs/sunrpc) uses socket ops on kernel memory while
> - * holding the mmap_sem, this is safe because kernel memory doesn't
> - * get paged out, therefore we'll never actually fault, and the
> - * below annotations will generate false positives.
> - */
> - if (segment_eq(get_fs(), KERNEL_DS))
> - return;
> -
> - might_sleep();
> - /*
> * it would be nicer only to annotate paths which are not under
> * pagefault_disable, however that requires a larger audit and
> * providing helpers like get_user_atomic.
> @@ -4231,7 +4221,7 @@ void might_fault(void)
> if (!in_atomic() && current->mm)
> might_lock_read(&current->mm->mmap_sem);
> }
> -EXPORT_SYMBOL(might_fault);
> +EXPORT_SYMBOL(might_fault_lockdep);
> #endif
>
> #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)

2013-05-22 09:59:43

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 00/10] uaccess: better might_sleep/might_fault behavior

On Wed, May 22, 2013 at 11:25:36AM +0200, Arnd Bergmann wrote:
> On Thursday 16 May 2013, Michael S. Tsirkin wrote:
> > This improves the might_fault annotations used
> > by uaccess routines:
> >
> > 1. The only reason uaccess routines might sleep
> > is if they fault. Make this explicit for
> > all architectures.
> > 2. Accesses (e.g through socket ops) to kernel memory
> > with KERNEL_DS like net/sunrpc does will never sleep.
> > Remove an unconditional might_sleep in the inline
> > might_fault in kernel.h
> > (used when PROVE_LOCKING is not set).
> > 3. Accesses with pagefault_disable return EFAULT
> > but won't cause caller to sleep.
> > Check for that and avoid might_sleep when
> > PROVE_LOCKING is set.
> >
> > I'd like these changes to go in for the benefit of
> > the vhost driver where we want to call socket ops
> > under a spinlock, and fall back on slower thread handler
> > on error.
>
> Hi Michael,
>
> I have recently stumbled over a related topic, which is the highly
> inconsistent placement of might_fault() or might_sleep() in certain
> classes of uaccess functions. Your patches seem completely reasonable,
> but it would be good to also fix the other problem, at least on
> the architectures we most care about.
>
> Given the most commonly used functions and a couple of architectures
> I'm familiar with, these are the ones that currently call might_fault()
>
> x86-32 x86-64 arm arm64 powerpc s390 generic
> copy_to_user - x - - - x x
> copy_from_user - x - - - x x
> put_user x x x x x x x
> get_user x x x x x x x
> __copy_to_user x x - - x - -
> __copy_from_user x x - - x - -
> __put_user - - x - x - -
> __get_user - - x - x - -
>
> WTF?

Yea.

> Calling might_fault() for every __get_user/__put_user is rather expensive
> because it turns what should be a single instruction (plus fixup) into an
> external function call.

You mean _cond_resched with CONFIG_PREEMPT_VOLUNTARY? Or do you
mean when we build with PROVE_LOCKING?

> My feeling is that we should do might_fault() only in access_ok() to get
> the right balance.
>
> Arnd

Well access_ok is currently non-blocking I think - we'd have to audit
all callers. There are some 200 of these in drivers and some
1000 total so ... a bit risky.

--
MST

2013-05-22 10:16:59

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Wed, May 22, 2013 at 12:47:09PM +0300, Michael S. Tsirkin wrote:
> >
> > +static inline bool __can_fault(void)
> > +{
> > + /*
> > + * Some code (nfs/sunrpc) uses socket ops on kernel memory while
> > + * holding the mmap_sem, this is safe because kernel memory doesn't
> > + * get paged out, therefore we'll never actually fault, and the
> > + * below annotations will generate false positives.
> > + */
> > + if (segment_eq(get_fs(), KERNEL_DS))
> > + return false;
> > +
> > + if (in_atomic() /* || pagefault_disabled() */)
>
> One question here: I'm guessing you put this comment here
> for illustrative purposes, implying code that will
> be enabled in -rt?
> We don't want it upstream I think, right?

Right, and as a reminder that when we do this we need to add a patch to
-rt. But yeah, we should have a look and see if its worth pulling those
patches from -rt into mainline in some way shape or form. They're big
but trivial IIRC.

I'm fine with you leaving that comment out though..

2013-05-22 10:19:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 00/10] uaccess: better might_sleep/might_fault behavior

On Wed, May 22, 2013 at 11:25:36AM +0200, Arnd Bergmann wrote:
> Calling might_fault() for every __get_user/__put_user is rather expensive
> because it turns what should be a single instruction (plus fixup) into an
> external function call.

We could hide it all behind CONFIG_DEBUG_ATOMIC_SLEEP just like
might_sleep() is. I'm not sure there's a point to might_fault() when
might_sleep() is a NOP.

2013-05-22 11:09:08

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 00/10] uaccess: better might_sleep/might_fault behavior

On Wed, May 22, 2013 at 12:19:16PM +0200, Peter Zijlstra wrote:
> On Wed, May 22, 2013 at 11:25:36AM +0200, Arnd Bergmann wrote:
> > Calling might_fault() for every __get_user/__put_user is rather expensive
> > because it turns what should be a single instruction (plus fixup) into an
> > external function call.
>
> We could hide it all behind CONFIG_DEBUG_ATOMIC_SLEEP just like
> might_sleep() is. I'm not sure there's a point to might_fault() when
> might_sleep() is a NOP.

The patch that you posted gets pretty close.
E.g. I'm testing this now:
+#define might_fault() do { \
+ if (_might_fault()) \
+ __might_sleep(__FILE__, __LINE__, 0); \
+ might_resched(); \
+} while(0)

So if might_sleep is a NOP, __might_sleep and might_resched are NOPs
so compiler will optimize this all out.

However, in a related thread, you pointed out that might_sleep is not a NOP if
CONFIG_PREEMPT_VOLUNTARY is set, even without CONFIG_DEBUG_ATOMIC_SLEEP.

Do you think we should drop the preemption point in might_fault?
Only copy_XX_user?
Only __copy_XXX_user ?

--
MST

2013-05-22 11:28:31

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 00/10] uaccess: better might_sleep/might_fault behavior

On Wed, May 22, 2013 at 02:07:29PM +0300, Michael S. Tsirkin wrote:
> On Wed, May 22, 2013 at 12:19:16PM +0200, Peter Zijlstra wrote:
> > On Wed, May 22, 2013 at 11:25:36AM +0200, Arnd Bergmann wrote:
> > > Calling might_fault() for every __get_user/__put_user is rather expensive
> > > because it turns what should be a single instruction (plus fixup) into an
> > > external function call.
> >
> > We could hide it all behind CONFIG_DEBUG_ATOMIC_SLEEP just like
> > might_sleep() is. I'm not sure there's a point to might_fault() when
> > might_sleep() is a NOP.
>
> The patch that you posted gets pretty close.
> E.g. I'm testing this now:
> +#define might_fault() do { \
> + if (_might_fault()) \
> + __might_sleep(__FILE__, __LINE__, 0); \
> + might_resched(); \
> +} while(0)
>
> So if might_sleep is a NOP, __might_sleep and might_resched are NOPs
> so compiler will optimize this all out.
>
> However, in a related thread, you pointed out that might_sleep is not a NOP if
> CONFIG_PREEMPT_VOLUNTARY is set, even without CONFIG_DEBUG_ATOMIC_SLEEP.

Oh crud yeah.. and you actually need that _might_fault() stuff for that
too. Bugger.

Yeah, I wouldn't know what the effects of dropping it (for the copy
functions) would be, VOLUNTARY isn't a preemption mode I ever use (even
though it seems most distros default to it).

2013-05-22 13:44:14

by Russell King - ARM Linux

[permalink] [raw]
Subject: Re: [PATCH v2 00/10] uaccess: better might_sleep/might_fault behavior

On Wed, May 22, 2013 at 11:25:36AM +0200, Arnd Bergmann wrote:
> Given the most commonly used functions and a couple of architectures
> I'm familiar with, these are the ones that currently call might_fault()
>
> x86-32 x86-64 arm arm64 powerpc s390 generic
> copy_to_user - x - - - x x
> copy_from_user - x - - - x x
> put_user x x x x x x x
> get_user x x x x x x x
> __copy_to_user x x - - x - -
> __copy_from_user x x - - x - -
> __put_user - - x - x - -
> __get_user - - x - x - -
>
> WTF?

I think your table is rather screwed - especially on ARM. Tell me -
how can __copy_to_user() use might_fault() but copy_to_user() not when
copy_to_user() is implemented using __copy_to_user() ? Same for
copy_from_user() but the reverse argument - there's nothing special
in our copy_from_user() which would make it do might_fault() when
__copy_from_user() wouldn't.

The correct position for ARM is: our (__)?(pu|ge)t_user all use
might_fault(), but (__)?copy_(to|from)_user do not. Neither does
(__)?clear_user. We might want to fix those to use might_fault().

2013-05-22 14:01:25

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH v2 07/10] powerpc: uaccess s/might_sleep/might_fault/

On Thursday 16 May 2013, Michael S. Tsirkin wrote:
> @@ -178,7 +178,7 @@ do { \
> long __pu_err; \
> __typeof__(*(ptr)) __user *__pu_addr = (ptr); \
> if (!is_kernel_addr((unsigned long)__pu_addr)) \
> - might_sleep(); \
> + might_fault(); \
> __chk_user_ptr(ptr); \
> __put_user_size((x), __pu_addr, (size), __pu_err); \
> __pu_err; \
>

Another observation:

if (!is_kernel_addr((unsigned long)__pu_addr))
might_sleep();

is almost the same as

might_fault();

except that it does not call might_lock_read().

The version above may have been put there intentionally and correctly, but
if you want to replace it with might_fault(), you should remove the
"if ()" condition.

Arnd

2013-05-22 14:05:06

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH v2 00/10] uaccess: better might_sleep/might_fault behavior

On Wednesday 22 May 2013, Russell King - ARM Linux wrote:
> On Wed, May 22, 2013 at 11:25:36AM +0200, Arnd Bergmann wrote:
> > Given the most commonly used functions and a couple of architectures
> > I'm familiar with, these are the ones that currently call might_fault()
> >
> > x86-32 x86-64 arm arm64 powerpc s390 generic
> > copy_to_user - x - - - x x
> > copy_from_user - x - - - x x
> > put_user x x x x x x x
> > get_user x x x x x x x
> > __copy_to_user x x - - x - -
> > __copy_from_user x x - - x - -
> > __put_user - - x - x - -
> > __get_user - - x - x - -
> >
> > WTF?
>
> I think your table is rather screwed - especially on ARM. Tell me -
> how can __copy_to_user() use might_fault() but copy_to_user() not when
> copy_to_user() is implemented using __copy_to_user() ? Same for
> copy_from_user() but the reverse argument - there's nothing special
> in our copy_from_user() which would make it do might_fault() when
> __copy_from_user() wouldn't.

I think something went wrong with formatting of the tab stops in
the table. I've tried to correct it above to the same version I
see on the mailing list.

> The correct position for ARM is: our (__)?(pu|ge)t_user all use
> might_fault(), but (__)?copy_(to|from)_user do not. Neither does
> (__)?clear_user. We might want to fix those to use might_fault().

Yes, that sounds like a good idea, especially since they are all
implemented out-of-line.

For __get_user()/__put_user(), I would probably do the reverse and make
them not call might_fault() though, like we do on most other architectures:

Look at the object code produced for setup_sigframe for instance, it calls
might_fault() around 25 times where one should really be enough. Using
__put_user() instead of put_user() is normally an indication that the
author of that function has made performance considerations and move the
(trivial) access_ok() call out, but now we add a more expensive
call instead.

Arnd

2013-05-22 14:32:21

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 07/10] powerpc: uaccess s/might_sleep/might_fault/

On Wed, May 22, 2013 at 03:59:01PM +0200, Arnd Bergmann wrote:
> On Thursday 16 May 2013, Michael S. Tsirkin wrote:
> > @@ -178,7 +178,7 @@ do { \
> > long __pu_err; \
> > __typeof__(*(ptr)) __user *__pu_addr = (ptr); \
> > if (!is_kernel_addr((unsigned long)__pu_addr)) \
> > - might_sleep(); \
> > + might_fault(); \
> > __chk_user_ptr(ptr); \
> > __put_user_size((x), __pu_addr, (size), __pu_err); \
> > __pu_err; \
> >
>
> Another observation:
>
> if (!is_kernel_addr((unsigned long)__pu_addr))
> might_sleep();
>
> is almost the same as
>
> might_fault();
>
> except that it does not call might_lock_read().
>
> The version above may have been put there intentionally and correctly, but
> if you want to replace it with might_fault(), you should remove the
> "if ()" condition.
>
> Arnd

Good point, thanks.
I think I'll do it in a separate patch on top,
just to make sure bisect does not result in a revision that
produces false positive warnings.

--
MST

2013-05-22 14:45:36

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 00/10] uaccess: better might_sleep/might_fault behavior

On Wed, May 22, 2013 at 04:04:48PM +0200, Arnd Bergmann wrote:
> On Wednesday 22 May 2013, Russell King - ARM Linux wrote:
> > On Wed, May 22, 2013 at 11:25:36AM +0200, Arnd Bergmann wrote:
> > > Given the most commonly used functions and a couple of architectures
> > > I'm familiar with, these are the ones that currently call might_fault()
> > >
> > > x86-32 x86-64 arm arm64 powerpc s390 generic
> > > copy_to_user - x - - - x x
> > > copy_from_user - x - - - x x
> > > put_user x x x x x x x
> > > get_user x x x x x x x
> > > __copy_to_user x x - - x - -
> > > __copy_from_user x x - - x - -
> > > __put_user - - x - x - -
> > > __get_user - - x - x - -
> > >
> > > WTF?
> >
> > I think your table is rather screwed - especially on ARM. Tell me -
> > how can __copy_to_user() use might_fault() but copy_to_user() not when
> > copy_to_user() is implemented using __copy_to_user() ? Same for
> > copy_from_user() but the reverse argument - there's nothing special
> > in our copy_from_user() which would make it do might_fault() when
> > __copy_from_user() wouldn't.
>
> I think something went wrong with formatting of the tabstobs in
> the table. I've tried to correct it above to the same version I
> see on the mailing list.
>
> > The correct position for ARM is: our (__)?(pu|ge)t_user all use
> > might_fault(), but (__)?copy_(to|from)_user do not. Neither does
> > (__)?clear_user. We might want to fix those to use might_fault().
>
> Yes, that sounds like a good idea, especially since they are all
> implemented out-of-line.
>
> For __get_user()/__put_user(), I would probably do the reverse and make
> them not call might_fault() though, like we do on most other architectures:
>
> Look at the object code produced for setup_sigframe for instance, it calls
> might_fault() around 25 times where one should really be enough.

Well it depends on what config options you set.
But with VOLUNTARY you are right.
Also, look at memcpy_fromiovec and weep.

> Using
> __put_user() instead of put_user() is normally an indication that the
> author of that function has made performance considerations and move the
> (trivial) access_ok() call out, but now we add a more expensive
> call instead.
>
> Arnd

I think exactly the same rules should apply to __XXX_user and
__copy_XXX_user - otherwise it's really confusing.

Maybe a preempt point in might_fault should go away?
Basically

#define might_fault() __might_sleep(__FILE__, __LINE__, 0)

Possibly adding the in_atomic() etc checks that Peter suggested.

Ingo, what do you think? And what testing would be appropriate
for such a change?


Thanks,

--
MST

2013-05-22 20:38:21

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Thu, May 16, 2013 at 08:40:41PM +0200, Peter Zijlstra wrote:
> On Thu, May 16, 2013 at 02:16:10PM +0300, Michael S. Tsirkin wrote:
> > There are several ways to make sure might_fault
> > calling function does not sleep.
> > One is to use it on kernel or otherwise locked memory - apparently
> > nfs/sunrpc does this. As noted by Ingo, this is handled by the
> > might_fault() implementation in mm/memory.c but not the one in
> > linux/kernel.h so in the current code might_fault() schedules
> > differently depending on CONFIG_PROVE_LOCKING, which is an undesired
> > semantical side effect.
> >
> > Another is to call pagefault_disable: in this case the page fault
> > handler will go to fixups processing and we get an error instead of
> > sleeping, so the might_sleep annotation is a false positive.
> > vhost driver wants to do this now in order to reuse socket ops
> > under a spinlock (and fall back on slower thread handler
> > on error).
>
> Are you using the assumption that spin_lock() implies preempt_disable() implies
> pagefault_disable()? Note that this assumption isn't valid for -rt where the
> spinlock becomes preemptible but we'll not disable pagefaults.
>
> > Address both issues by:
> > - dropping the unconditional call to might_sleep
> > from the fast might_fault code in linux/kernel.h
> > - checking for pagefault_disable() in the
> > CONFIG_PROVE_LOCKING implementation
> >
> > Signed-off-by: Michael S. Tsirkin <[email protected]>
> > ---
> > include/linux/kernel.h | 1 -
> > mm/memory.c | 14 +++++++++-----
> > 2 files changed, 9 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> > index e96329c..322b065 100644
> > --- a/include/linux/kernel.h
> > +++ b/include/linux/kernel.h
> > @@ -198,7 +198,6 @@ void might_fault(void);
> > #else
> > static inline void might_fault(void)
> > {
> > - might_sleep();
>
> This removes potential resched points for PREEMPT_VOLUNTARY -- was that
> intentional?

OK so I'm thinking of going back to this idea:
it has the advantage of being very simple,
and just might make some workloads faster
if they do lots of copy_XX_user in a loop.

Will have to be tested of course - anyone
has objections?

> > }
> > #endif
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 6dc1882..1b8327b 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4222,13 +4222,17 @@ void might_fault(void)
> > if (segment_eq(get_fs(), KERNEL_DS))
> > return;
> >
> > - might_sleep();
> > /*
> > - * it would be nicer only to annotate paths which are not under
> > - * pagefault_disable, however that requires a larger audit and
> > - * providing helpers like get_user_atomic.
> > + * It would be nicer to annotate paths which are under preempt_disable
> > + * but not under pagefault_disable, however that requires a new flag
> > + * for differentiating between the two.
>
> -rt has this, pagefault_disable() doesn't change the preempt count but pokes
> at task_struct::pagefault_disable.
>
> > */
> > - if (!in_atomic() && current->mm)
> > + if (in_atomic())
> > + return;
> > +
> > + might_sleep();
> > +
> > + if (current->mm)
> > might_lock_read(&current->mm->mmap_sem);
> > }
> > EXPORT_SYMBOL(might_fault);
> > --
> > MST

2013-05-22 20:39:51

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 10/10] kernel: might_fault does not imply might_sleep

On Tue, May 21, 2013 at 01:57:34PM +0200, Peter Zijlstra wrote:
> On Sun, May 19, 2013 at 12:35:26PM +0300, Michael S. Tsirkin wrote:
> > > > --- a/include/linux/kernel.h
> > > > +++ b/include/linux/kernel.h
> > > > @@ -198,7 +198,6 @@ void might_fault(void);
> > > > #else
> > > > static inline void might_fault(void)
> > > > {
> > > > - might_sleep();
> > >
> > > This removes potential resched points for PREEMPT_VOLUNTARY -- was that
> > > intentional?
> >
> > No it's a bug. Thanks for pointing this out.
> > OK so I guess it should be might_sleep_if(!in_atomic())
> > and this means might_fault would have to move from linux/kernel.h to
> > linux/uaccess.h, since in_atomic() is in linux/hardirq.h
> >
> > Makes sense?
>
> So the only difference between PROVE_LOCKING and not should be the
> might_lock_read() thing; so how about something like this?

So the problem with the below is that might_fault is needed
in asm/uaccess.h.
I'm still trying various approaches but the dependencies there
are very complex.

> ---
> include/linux/kernel.h | 7 ++-----
> include/linux/uaccess.h | 26 ++++++++++++++++++++++++++
> mm/memory.c | 14 ++------------
> 3 files changed, 30 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e96329c..70812f4 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -194,12 +194,9 @@ extern int _cond_resched(void);
> })
>
> #ifdef CONFIG_PROVE_LOCKING
> -void might_fault(void);
> +void might_fault_lockdep(void);
> #else
> -static inline void might_fault(void)
> -{
> - might_sleep();
> -}
> +static inline void might_fault_lockdep(void) { }
> #endif
>
> extern struct atomic_notifier_head panic_notifier_list;
> diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
> index 5ca0951..50a2cc9 100644
> --- a/include/linux/uaccess.h
> +++ b/include/linux/uaccess.h
> @@ -38,6 +38,32 @@ static inline void pagefault_enable(void)
> preempt_check_resched();
> }
>
> +static inline bool __can_fault(void)
> +{
> + /*
> + * Some code (nfs/sunrpc) uses socket ops on kernel memory while
> + * holding the mmap_sem, this is safe because kernel memory doesn't
> + * get paged out, therefore we'll never actually fault, and the
> + * below annotations will generate false positives.
> + */
> + if (segment_eq(get_fs(), KERNEL_DS))
> + return false;
> +
> + if (in_atomic() /* || pagefault_disabled() */)
> + return false;
> +
> + return true;
> +}
> +
> +static inline void might_fault(void)
> +{
> + if (!__can_fault())
> + return;
> +
> + might_sleep();
> + might_fault_lockdep();
> +}
> +
> #ifndef ARCH_HAS_NOCACHE_UACCESS
>
> static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
> diff --git a/mm/memory.c b/mm/memory.c
> index 6dc1882..266610c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4211,19 +4211,9 @@ void print_vma_addr(char *prefix, unsigned long ip)
> }
>
> #ifdef CONFIG_PROVE_LOCKING
> -void might_fault(void)
> +void might_fault_lockdep(void)
> {
> /*
> - * Some code (nfs/sunrpc) uses socket ops on kernel memory while
> - * holding the mmap_sem, this is safe because kernel memory doesn't
> - * get paged out, therefore we'll never actually fault, and the
> - * below annotations will generate false positives.
> - */
> - if (segment_eq(get_fs(), KERNEL_DS))
> - return;
> -
> - might_sleep();
> - /*
> * it would be nicer only to annotate paths which are not under
> * pagefault_disable, however that requires a larger audit and
> * providing helpers like get_user_atomic.
> @@ -4231,7 +4221,7 @@ void might_fault(void)
> if (!in_atomic() && current->mm)
> might_lock_read(&current->mm->mmap_sem);
> }
> -EXPORT_SYMBOL(might_fault);
> +EXPORT_SYMBOL(might_fault_lockdep);
> #endif
>
> #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)

2013-05-24 13:02:32

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 07/10] powerpc: uaccess s/might_sleep/might_fault/

On Wed, May 22, 2013 at 03:59:01PM +0200, Arnd Bergmann wrote:
> On Thursday 16 May 2013, Michael S. Tsirkin wrote:
> > @@ -178,7 +178,7 @@ do { \
> > long __pu_err; \
> > __typeof__(*(ptr)) __user *__pu_addr = (ptr); \
> > if (!is_kernel_addr((unsigned long)__pu_addr)) \
> > - might_sleep(); \
> > + might_fault(); \
> > __chk_user_ptr(ptr); \
> > __put_user_size((x), __pu_addr, (size), __pu_err); \
> > __pu_err; \
> >
>
> Another observation:
>
> if (!is_kernel_addr((unsigned long)__pu_addr))
> might_sleep();
>
> is almost the same as
>
> might_fault();
>
> except that it does not call might_lock_read().
>
> The version above may have been put there intentionally and correctly, but
> if you want to replace it with might_fault(), you should remove the
> "if ()" condition.
>
> Arnd

Well not exactly. The non-inline might_fault checks the
current segment, not the address.
I'm guessing this is trying to do the same just without
pulling in segment_eq, but I'd like a confirmation
from more PPC maintainers.

Guys would you ack

- if (!is_kernel_addr((unsigned long)__pu_addr))
- might_fault();
+ might_fault();

on top of this patch?

Also, any volunteer to test this (not just test-build)?

--
MST

2013-05-24 13:12:28

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH v2 07/10] powerpc: uaccess s/might_sleep/might_fault/

On Fri, May 24, 2013 at 04:00:32PM +0300, Michael S. Tsirkin wrote:
> On Wed, May 22, 2013 at 03:59:01PM +0200, Arnd Bergmann wrote:
> > On Thursday 16 May 2013, Michael S. Tsirkin wrote:
> > > @@ -178,7 +178,7 @@ do { \
> > > long __pu_err; \
> > > __typeof__(*(ptr)) __user *__pu_addr = (ptr); \
> > > if (!is_kernel_addr((unsigned long)__pu_addr)) \
> > > - might_sleep(); \
> > > + might_fault(); \
> > > __chk_user_ptr(ptr); \
> > > __put_user_size((x), __pu_addr, (size), __pu_err); \
> > > __pu_err; \
> > >
> >
> > Another observation:
> >
> > if (!is_kernel_addr((unsigned long)__pu_addr))
> > might_sleep();
> >
> > is almost the same as
> >
> > might_fault();
> >
> > except that it does not call might_lock_read().
> >
> > The version above may have been put there intentionally and correctly, but
> > if you want to replace it with might_fault(), you should remove the
> > "if ()" condition.
> >
> > Arnd
>
> Well not exactly. The non-inline might_fault checks the
> current segment, not the address.
> I'm guessing this is trying to do the same just without
> pulling in segment_eq, but I'd like a confirmation
> from more PPC maintainers.
>
> Guys would you ack
>
> - if (!is_kernel_addr((unsigned long)__pu_addr))
> - might_fault();
> + might_fault();
>
> on top of this patch?

OK I spoke too fast: I found this:

powerpc: Fix incorrect might_sleep in __get_user/__put_user on kernel addresses

We have a case where __get_user and __put_user can validly be used
on kernel addresses in interrupt context - namely, the alignment
exception handler, as our get/put_unaligned just do a single access
and rely on the alignment exception handler to fix things up in the
rare cases where the cpu can't handle it in hardware. Thus we can
get alignment exceptions in the network stack at interrupt level.
The alignment exception handler does a __get_user to read the
instruction and blows up in might_sleep().

Since a __get_user on a kernel address won't actually ever sleep,
this makes the might_sleep conditional on the address being less
than PAGE_OFFSET.

Signed-off-by: Paul Mackerras <[email protected]>

So this won't work, unless we add the is_kernel_addr check
to might_fault. That will become possible on top of this patchset
but let's consider this carefully, and make this a separate
patchset, OK?

> Also, any volunteer to test this (not just test-build)?
>
> --
> MST

2013-05-24 13:33:06

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH v2 07/10] powerpc: uaccess s/might_sleep/might_fault/

On Friday 24 May 2013, Michael S. Tsirkin wrote:
> So this won't work, unless we add the is_kernel_addr check
> to might_fault. That will become possible on top of this patchset
> but let's consider this carefully, and make this a separate
> patchset, OK?

Yes, makes sense.

Arnd

2013-05-24 14:17:27

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 01/11] asm-generic: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
include/asm-generic/uaccess.h | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h
index c184aa8..dc1269c 100644
--- a/include/asm-generic/uaccess.h
+++ b/include/asm-generic/uaccess.h
@@ -163,7 +163,7 @@ static inline __must_check long __copy_to_user(void __user *to,

#define put_user(x, ptr) \
({ \
- might_sleep(); \
+ might_fault(); \
access_ok(VERIFY_WRITE, ptr, sizeof(*ptr)) ? \
__put_user(x, ptr) : \
-EFAULT; \
@@ -225,7 +225,7 @@ extern int __put_user_bad(void) __attribute__((noreturn));

#define get_user(x, ptr) \
({ \
- might_sleep(); \
+ might_fault(); \
access_ok(VERIFY_READ, ptr, sizeof(*ptr)) ? \
__get_user(x, ptr) : \
-EFAULT; \
@@ -255,7 +255,7 @@ extern int __get_user_bad(void) __attribute__((noreturn));
static inline long copy_from_user(void *to,
const void __user * from, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_READ, from, n))
return __copy_from_user(to, from, n);
else
@@ -265,7 +265,7 @@ static inline long copy_from_user(void *to,
static inline long copy_to_user(void __user *to,
const void *from, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_WRITE, to, n))
return __copy_to_user(to, from, n);
else
@@ -336,7 +336,7 @@ __clear_user(void __user *to, unsigned long n)
static inline __must_check unsigned long
clear_user(void __user *to, unsigned long n)
{
- might_sleep();
+ might_fault();
if (!access_ok(VERIFY_WRITE, to, n))
return n;

--
MST

2013-05-24 14:17:36

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 03/11] frv: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/frv/include/asm/uaccess.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/frv/include/asm/uaccess.h b/arch/frv/include/asm/uaccess.h
index 0b67ec5..3ac9a59 100644
--- a/arch/frv/include/asm/uaccess.h
+++ b/arch/frv/include/asm/uaccess.h
@@ -280,14 +280,14 @@ extern long __memcpy_user(void *dst, const void *src, unsigned long count);
static inline unsigned long __must_check
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
- might_sleep();
+ might_fault();
return __copy_to_user_inatomic(to, from, n);
}

static inline unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
- might_sleep();
+ might_fault();
return __copy_from_user_inatomic(to, from, n);
}

--
MST

2013-05-24 14:17:55

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 04/11] m32r: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/m32r/include/asm/uaccess.h | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h
index 1c7047b..84fe7ba 100644
--- a/arch/m32r/include/asm/uaccess.h
+++ b/arch/m32r/include/asm/uaccess.h
@@ -216,7 +216,7 @@ extern int fixup_exception(struct pt_regs *regs);
({ \
long __gu_err = 0; \
unsigned long __gu_val; \
- might_sleep(); \
+ might_fault(); \
__get_user_size(__gu_val,(ptr),(size),__gu_err); \
(x) = (__typeof__(*(ptr)))__gu_val; \
__gu_err; \
@@ -227,7 +227,7 @@ extern int fixup_exception(struct pt_regs *regs);
long __gu_err = -EFAULT; \
unsigned long __gu_val = 0; \
const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \
- might_sleep(); \
+ might_fault(); \
if (access_ok(VERIFY_READ,__gu_addr,size)) \
__get_user_size(__gu_val,__gu_addr,(size),__gu_err); \
(x) = (__typeof__(*(ptr)))__gu_val; \
@@ -295,7 +295,7 @@ do { \
#define __put_user_nocheck(x,ptr,size) \
({ \
long __pu_err; \
- might_sleep(); \
+ might_fault(); \
__put_user_size((x),(ptr),(size),__pu_err); \
__pu_err; \
})
@@ -305,7 +305,7 @@ do { \
({ \
long __pu_err = -EFAULT; \
__typeof__(*(ptr)) __user *__pu_addr = (ptr); \
- might_sleep(); \
+ might_fault(); \
if (access_ok(VERIFY_WRITE,__pu_addr,size)) \
__put_user_size((x),__pu_addr,(size),__pu_err); \
__pu_err; \
@@ -597,7 +597,7 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
*/
#define copy_to_user(to,from,n) \
({ \
- might_sleep(); \
+ might_fault(); \
__generic_copy_to_user((to),(from),(n)); \
})

@@ -638,7 +638,7 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
*/
#define copy_from_user(to,from,n) \
({ \
- might_sleep(); \
+ might_fault(); \
__generic_copy_from_user((to),(from),(n)); \
})

--
MST

2013-05-24 14:18:03

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 06/11] mn10300: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/mn10300/include/asm/uaccess.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h
index 780560b..107508a 100644
--- a/arch/mn10300/include/asm/uaccess.h
+++ b/arch/mn10300/include/asm/uaccess.h
@@ -471,13 +471,13 @@ extern unsigned long __generic_copy_from_user(void *, const void __user *,

#define __copy_to_user(to, from, n) \
({ \
- might_sleep(); \
+ might_fault(); \
__copy_to_user_inatomic((to), (from), (n)); \
})

#define __copy_from_user(to, from, n) \
({ \
- might_sleep(); \
+ might_fault(); \
__copy_from_user_inatomic((to), (from), (n)); \
})

--
MST

2013-05-24 14:18:13

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 08/11] tile: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Acked-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/uaccess.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index 8a082bc..e4d44bd 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -442,7 +442,7 @@ extern unsigned long __copy_in_user_inatomic(
static inline unsigned long __must_check
__copy_in_user(void __user *to, const void __user *from, unsigned long n)
{
- might_sleep();
+ might_fault();
return __copy_in_user_inatomic(to, from, n);
}

--
MST

2013-05-24 14:18:22

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 10/11] kernel: drop voluntary schedule from might_fault

might_fault is called from functions like copy_to_user
which most callers expect to be very fast, like
a couple of instructions. So functions like memcpy_toiovec call them
many times in a loop.
But might_fault calls might_sleep() and with CONFIG_PREEMPT_VOLUNTARY
this results in a function call.

Let's not do this - just call __might_sleep that produces
a diagnostic for sleep within atomic, but drop
might_preempt().

Here's a test sending traffic between the VM and the host,
host is built with CONFIG_PREEMPT_VOLUNTARY:
Before:
incoming: 7122.77 Mb/s
outgoing: 8480.37 Mb/s
after:
incoming: 8619.24 Mb/s
outgoing: 9455.42 Mb/s

As a side effect, this fixes an issue pointed
out by Ingo: might_fault might schedule differently
depending on PROVE_LOCKING. Now there's no
preemption point in both cases, so it's consistent.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
include/linux/kernel.h | 2 +-
mm/memory.c | 3 ++-
2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e96329c..c514c06 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -198,7 +198,7 @@ void might_fault(void);
#else
static inline void might_fault(void)
{
- might_sleep();
+ __might_sleep(__FILE__, __LINE__, 0);
}
#endif

diff --git a/mm/memory.c b/mm/memory.c
index 6dc1882..c1f190f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4222,7 +4222,8 @@ void might_fault(void)
if (segment_eq(get_fs(), KERNEL_DS))
return;

- might_sleep();
+ __might_sleep(__FILE__, __LINE__, 0);
+
/*
* it would be nicer only to annotate paths which are not under
* pagefault_disable, however that requires a larger audit and
--
MST

2013-05-24 14:18:21

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 09/11] x86: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Acked-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/uaccess_64.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 142810c..4f7923d 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -235,7 +235,7 @@ extern long __copy_user_nocache(void *dst, const void __user *src,
static inline int
__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
{
- might_sleep();
+ might_fault();
return __copy_user_nocache(dst, src, size, 1);
}

--
MST

2013-05-24 14:18:49

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 11/11] kernel: uaccess in atomic with pagefault_disable

This changes might_fault so that it does not
trigger a false positive diagnostic for e.g. the following
sequence:
spin_lock_irqsave
pagefault_disable
copy_to_user
pagefault_enable
spin_unlock_irqrestore

In particular vhost wants to do this, to call
socket ops from under a lock.

There are 3 cases to consider:
CONFIG_PROVE_LOCKING - might_fault is non-inline
so it's easy to move the in_atomic test to fix
up the false positive warning.

CONFIG_DEBUG_ATOMIC_SLEEP - might_fault
is currently inline, but we are calling a
non-inline __might_sleep anyway,
so let's use the non-inline version of might_fault
that does the right thing.

!CONFIG_DEBUG_ATOMIC_SLEEP && !CONFIG_PROVE_LOCKING
__might_sleep is a nop so might_fault is a nop.
Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
include/linux/kernel.h | 7 ++-----
mm/memory.c | 11 +++++++----
2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index c514c06..0153be1 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -193,13 +193,10 @@ extern int _cond_resched(void);
(__x < 0) ? -__x : __x; \
})

-#ifdef CONFIG_PROVE_LOCKING
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void might_fault(void);
#else
-static inline void might_fault(void)
-{
- __might_sleep(__FILE__, __LINE__, 0);
-}
+static inline void might_fault(void) { }
#endif

extern struct atomic_notifier_head panic_notifier_list;
diff --git a/mm/memory.c b/mm/memory.c
index c1f190f..d7d54a1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4210,7 +4210,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
up_read(&mm->mmap_sem);
}

-#ifdef CONFIG_PROVE_LOCKING
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void might_fault(void)
{
/*
@@ -4222,14 +4222,17 @@ void might_fault(void)
if (segment_eq(get_fs(), KERNEL_DS))
return;

- __might_sleep(__FILE__, __LINE__, 0);
-
/*
* it would be nicer only to annotate paths which are not under
* pagefault_disable, however that requires a larger audit and
* providing helpers like get_user_atomic.
*/
- if (!in_atomic() && current->mm)
+ if (in_atomic())
+ return;
+
+ __might_sleep(__FILE__, __LINE__, 0);
+
+ if (current->mm)
might_lock_read(&current->mm->mmap_sem);
}
EXPORT_SYMBOL(might_fault);
--
MST

2013-05-24 14:19:06

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 07/11] powerpc: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Arnd Bergmann suggested that the following code
if (!is_kernel_addr((unsigned long)__pu_addr))
might_fault();
can be further simplified by adding a version of might_fault
that includes the kernel addr check.

Will be considered as a further optimization in future.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/powerpc/include/asm/uaccess.h | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 4db4959..9485b43 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -178,7 +178,7 @@ do { \
long __pu_err; \
__typeof__(*(ptr)) __user *__pu_addr = (ptr); \
if (!is_kernel_addr((unsigned long)__pu_addr)) \
- might_sleep(); \
+ might_fault(); \
__chk_user_ptr(ptr); \
__put_user_size((x), __pu_addr, (size), __pu_err); \
__pu_err; \
@@ -188,7 +188,7 @@ do { \
({ \
long __pu_err = -EFAULT; \
__typeof__(*(ptr)) __user *__pu_addr = (ptr); \
- might_sleep(); \
+ might_fault(); \
if (access_ok(VERIFY_WRITE, __pu_addr, size)) \
__put_user_size((x), __pu_addr, (size), __pu_err); \
__pu_err; \
@@ -268,7 +268,7 @@ do { \
const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \
__chk_user_ptr(ptr); \
if (!is_kernel_addr((unsigned long)__gu_addr)) \
- might_sleep(); \
+ might_fault(); \
__get_user_size(__gu_val, __gu_addr, (size), __gu_err); \
(x) = (__typeof__(*(ptr)))__gu_val; \
__gu_err; \
@@ -282,7 +282,7 @@ do { \
const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \
__chk_user_ptr(ptr); \
if (!is_kernel_addr((unsigned long)__gu_addr)) \
- might_sleep(); \
+ might_fault(); \
__get_user_size(__gu_val, __gu_addr, (size), __gu_err); \
(x) = (__typeof__(*(ptr)))__gu_val; \
__gu_err; \
@@ -294,7 +294,7 @@ do { \
long __gu_err = -EFAULT; \
unsigned long __gu_val = 0; \
const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \
- might_sleep(); \
+ might_fault(); \
if (access_ok(VERIFY_READ, __gu_addr, (size))) \
__get_user_size(__gu_val, __gu_addr, (size), __gu_err); \
(x) = (__typeof__(*(ptr)))__gu_val; \
@@ -419,14 +419,14 @@ static inline unsigned long __copy_to_user_inatomic(void __user *to,
static inline unsigned long __copy_from_user(void *to,
const void __user *from, unsigned long size)
{
- might_sleep();
+ might_fault();
return __copy_from_user_inatomic(to, from, size);
}

static inline unsigned long __copy_to_user(void __user *to,
const void *from, unsigned long size)
{
- might_sleep();
+ might_fault();
return __copy_to_user_inatomic(to, from, size);
}

@@ -434,7 +434,7 @@ extern unsigned long __clear_user(void __user *addr, unsigned long size);

static inline unsigned long clear_user(void __user *addr, unsigned long size)
{
- might_sleep();
+ might_fault();
if (likely(access_ok(VERIFY_WRITE, addr, size)))
return __clear_user(addr, size);
if ((unsigned long)addr < TASK_SIZE) {
--
MST

2013-05-24 14:17:53

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 05/11] microblaze: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/microblaze/include/asm/uaccess.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index efe59d8..2fc8bf7 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -145,7 +145,7 @@ static inline unsigned long __must_check __clear_user(void __user *to,
static inline unsigned long __must_check clear_user(void __user *to,
unsigned long n)
{
- might_sleep();
+ might_fault();
if (unlikely(!access_ok(VERIFY_WRITE, to, n)))
return n;

@@ -371,7 +371,7 @@ extern long __user_bad(void);
static inline long copy_from_user(void *to,
const void __user *from, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_READ, from, n))
return __copy_from_user(to, from, n);
return n;
@@ -385,7 +385,7 @@ static inline long copy_from_user(void *to,
static inline long copy_to_user(void __user *to,
const void *from, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_WRITE, to, n))
return __copy_to_user(to, from, n);
return n;
--
MST

2013-05-24 14:17:52

by Michael S. Tsirkin

[permalink] [raw]
Subject: [PATCH v3 02/11] arm64: uaccess s/might_sleep/might_fault/

The only reason uaccess routines might sleep
is if they fault. Make this explicit.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Acked-by: Catalin Marinas <[email protected]>
---
arch/arm64/include/asm/uaccess.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 008f848..edb3d5c 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -166,7 +166,7 @@ do { \

#define get_user(x, ptr) \
({ \
- might_sleep(); \
+ might_fault(); \
access_ok(VERIFY_READ, (ptr), sizeof(*(ptr))) ? \
__get_user((x), (ptr)) : \
((x) = 0, -EFAULT); \
@@ -227,7 +227,7 @@ do { \

#define put_user(x, ptr) \
({ \
- might_sleep(); \
+ might_fault(); \
access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) ? \
__put_user((x), (ptr)) : \
-EFAULT; \
--
MST