Hi all,
this series start cleaning up the safe kernel and user memory probing
helpers in mm/maccess.c, and then allows architectures to implement
the kernel probing without overriding the address space limit and
temporarily allowing access to user memory. It then switches x86
over to this new mechanism by reusing the unsafe_* uaccess logic.
This version also switches to the saner copy_{from,to}_kernel_nofault
naming suggested by Linus.
I kept the x86 helprs as-is without calling unsage_{get,put}_user as
that avoids a number of hard to trace casts, and it will still work
with the asm-goto based version easily.
Provide arch_kernel_read and arch_kernel_write routines to implement the
maccess routines without messing with set_fs and without stac/clac that
opens up access to user space.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/x86/include/asm/uaccess.h | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index d8f283b9a569c..765e18417b3ba 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -523,5 +523,21 @@ do { \
unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \
} while (0)
+#define HAVE_ARCH_PROBE_KERNEL
+
+#define arch_kernel_read(dst, src, type, err_label) \
+do { \
+ int __kr_err; \
+ \
+ __get_user_size(*((type *)dst), (__force type __user *)src, \
+ sizeof(type), __kr_err); \
+ if (unlikely(__kr_err)) \
+ goto err_label; \
+} while (0)
+
+#define arch_kernel_write(dst, src, type, err_label) \
+ __put_user_size(*((type *)(src)), (__force type __user *)(dst), \
+ sizeof(type), err_label)
+
#endif /* _ASM_X86_UACCESS_H */
--
2.26.2
maccess tends to define lots of underscore prefixed symbols that then
have other weak aliases. But except for two cases they are never
actually used, so remove them.
Signed-off-by: Christoph Hellwig <[email protected]>
---
include/linux/uaccess.h | 3 ---
mm/maccess.c | 19 +++----------------
2 files changed, 3 insertions(+), 19 deletions(-)
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 67f016010aad5..a2c606a403745 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -324,7 +324,6 @@ extern long __probe_kernel_read(void *dst, const void *src, size_t size);
* happens, handle that and return -EFAULT.
*/
extern long probe_user_read(void *dst, const void __user *src, size_t size);
-extern long __probe_user_read(void *dst, const void __user *src, size_t size);
/*
* probe_kernel_write(): safely attempt to write to a location
@@ -336,7 +335,6 @@ extern long __probe_user_read(void *dst, const void __user *src, size_t size);
* happens, handle that and return -EFAULT.
*/
extern long notrace probe_kernel_write(void *dst, const void *src, size_t size);
-extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size);
/*
* probe_user_write(): safely attempt to write to a location in user space
@@ -348,7 +346,6 @@ extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size
* happens, handle that and return -EFAULT.
*/
extern long notrace probe_user_write(void __user *dst, const void *src, size_t size);
-extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size);
extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
extern long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
diff --git a/mm/maccess.c b/mm/maccess.c
index cf21e604f78cb..4e7f3b6eb05ae 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -79,11 +79,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
* Safely read from user address @src to the buffer at @dst. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-
-long __weak probe_user_read(void *dst, const void __user *src, size_t size)
- __attribute__((alias("__probe_user_read")));
-
-long __probe_user_read(void *dst, const void __user *src, size_t size)
+long probe_user_read(void *dst, const void __user *src, size_t size)
{
long ret = -EFAULT;
mm_segment_t old_fs = get_fs();
@@ -106,11 +102,7 @@ EXPORT_SYMBOL_GPL(probe_user_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-
-long __weak probe_kernel_write(void *dst, const void *src, size_t size)
- __attribute__((alias("__probe_kernel_write")));
-
-long __probe_kernel_write(void *dst, const void *src, size_t size)
+long probe_kernel_write(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
@@ -131,11 +123,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-
-long __weak probe_user_write(void __user *dst, const void *src, size_t size)
- __attribute__((alias("__probe_user_write")));
-
-long __probe_user_write(void __user *dst, const void *src, size_t size)
+long probe_user_write(void __user *dst, const void *src, size_t size)
{
long ret = -EFAULT;
mm_segment_t old_fs = get_fs();
@@ -171,7 +159,6 @@ long __probe_user_write(void __user *dst, const void *src, size_t size)
* probing memory on a user address range where strncpy_from_unsafe_user() is
* supposed to be used instead.
*/
-
long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
__attribute__((alias("__strncpy_from_unsafe")));
--
2.26.2
These two functions are not used by any modular code.
Signed-off-by: Christoph Hellwig <[email protected]>
---
mm/maccess.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/mm/maccess.c b/mm/maccess.c
index 3ca8d97e50106..cf21e604f78cb 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -121,7 +121,6 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
return ret;
}
-EXPORT_SYMBOL_GPL(probe_kernel_write);
/**
* probe_user_write(): safely attempt to write to a user-space location
@@ -148,7 +147,6 @@ long __probe_user_write(void __user *dst, const void *src, size_t size)
return ret;
}
-EXPORT_SYMBOL_GPL(probe_user_write);
/**
* strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
--
2.26.2
Add proper kerneldoc comments for probe_kernel_read_strict and
probe_kernel_read strncpy_from_unsafe_strict and explain the different
versus the non-strict version.
Signed-off-by: Christoph Hellwig <[email protected]>
---
mm/maccess.c | 61 ++++++++++++++++++++++++++++++++++++----------------
1 file changed, 43 insertions(+), 18 deletions(-)
diff --git a/mm/maccess.c b/mm/maccess.c
index 4e7f3b6eb05ae..747581ac50dc9 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -31,29 +31,35 @@ probe_write_common(void __user *dst, const void *src, size_t size)
}
/**
- * probe_kernel_read(): safely attempt to read from a kernel-space location
+ * probe_kernel_read(): safely attempt to read from any location
* @dst: pointer to the buffer that shall take the data
* @src: address to read from
* @size: size of the data chunk
*
- * Safely read from address @src to the buffer at @dst. If a kernel fault
- * happens, handle that and return -EFAULT.
+ * Same as probe_kernel_read_strict() except that for architectures with
+ * not fully separated user and kernel address spaces this function also works
+ * for user address tanges.
+ *
+ * DO NOT USE THIS FUNCTION - it is broken on architectures with entirely
+ * separate kernel and user address spaces, and also a bad idea otherwise.
+ */
+long __weak probe_kernel_read(void *dst, const void *src, size_t size)
+ __attribute__((alias("__probe_kernel_read")));
+
+/**
+ * probe_kernel_read_strict(): safely attempt to read from kernel-space
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from kernel address @src to the buffer at @dst. If a kernel
+ * fault happens, handle that and return -EFAULT.
*
* We ensure that the copy_from_user is executed in atomic context so that
* do_page_fault() doesn't attempt to take mmap_sem. This makes
* probe_kernel_read() suitable for use within regions where the caller
* already holds mmap_sem, or other locks which nest inside mmap_sem.
- *
- * probe_kernel_read_strict() is the same as probe_kernel_read() except for
- * the case where architectures have non-overlapping user and kernel address
- * ranges: probe_kernel_read_strict() will additionally return -EFAULT for
- * probing memory on a user address range where probe_user_read() is supposed
- * to be used instead.
*/
-
-long __weak probe_kernel_read(void *dst, const void *src, size_t size)
- __attribute__((alias("__probe_kernel_read")));
-
long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_read")));
@@ -153,15 +159,34 @@ long probe_user_write(void __user *dst, const void *src, size_t size)
* If @count is smaller than the length of the string, copies @count-1 bytes,
* sets the last byte of @dst buffer to NUL and returns @count.
*
- * strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except
- * for the case where architectures have non-overlapping user and kernel address
- * ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for
- * probing memory on a user address range where strncpy_from_unsafe_user() is
- * supposed to be used instead.
+ * Same as strncpy_from_unsafe_strict() except that for architectures with
+ * not fully separated user and kernel address spaces this function also works
+ * for user address tanges.
+ *
+ * DO NOT USE THIS FUNCTION - it is broken on architectures with entirely
+ * separate kernel and user address spaces, and also a bad idea otherwise.
*/
long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
__attribute__((alias("__strncpy_from_unsafe")));
+/**
+ * strncpy_from_unsafe_strict: - Copy a NUL terminated string from unsafe
+ * address.
+ * @dst: Destination address, in kernel space. This buffer must be at
+ * least @count bytes long.
+ * @unsafe_addr: Unsafe address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL.
+ *
+ * If access fails, returns -EFAULT (some data may have been copied
+ * and the trailing NUL added).
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
long count)
__attribute__((alias("__strncpy_from_unsafe")));
--
2.26.2
On Wed, May 13, 2020 at 9:00 AM Christoph Hellwig <[email protected]> wrote:
>
> this series start cleaning up the safe kernel and user memory probing
> helpers in mm/maccess.c, and then allows architectures to implement
> the kernel probing without overriding the address space limit and
> temporarily allowing access to user memory. It then switches x86
> over to this new mechanism by reusing the unsafe_* uaccess logic.
Ok, I think I found a bug, and I had one more suggestion, but other
than the two emails I sent this all looks like an improvement to me.
Linus
On 5/13/20 6:00 PM, Christoph Hellwig wrote:
> Hi all,
>
> this series start cleaning up the safe kernel and user memory probing
> helpers in mm/maccess.c, and then allows architectures to implement
> the kernel probing without overriding the address space limit and
> temporarily allowing access to user memory. It then switches x86
> over to this new mechanism by reusing the unsafe_* uaccess logic.
>
> This version also switches to the saner copy_{from,to}_kernel_nofault
> naming suggested by Linus.
>
> I kept the x86 helprs as-is without calling unsage_{get,put}_user as
> that avoids a number of hard to trace casts, and it will still work
> with the asm-goto based version easily.
Aside from comments on list, the series looks reasonable to me. For BPF
the bpf_probe_read() helper would be slightly penalized for probing user
memory given we now test on copy_from_kernel_nofault() first and if that
fails only then fall back to copy_from_user_nofault(), but it seems
small enough that it shouldn't matter too much and aside from that we have
the newer bpf_probe_read_kernel() and bpf_probe_read_user() anyway that
BPF progs should use instead, so I think it's okay.
For patch 14 and patch 15, do you roughly know the performance gain with
the new probe_kernel_read_loop() + arch_kernel_read() approach?
Thanks,
Daniel
On Wed, May 13, 2020 at 4:04 PM Daniel Borkmann <[email protected]> wrote:
>
> Aside from comments on list, the series looks reasonable to me. For BPF
> the bpf_probe_read() helper would be slightly penalized for probing user
> memory given we now test on copy_from_kernel_nofault() first and if that
> fails only then fall back to copy_from_user_nofault(),
Again, no.
If you can't tell that one or the other is always the right thing,
then that function is simply buggy and wrong.
On sparc and on s390, address X can be _both_ a kernel address and a
user address. You need to specify which it is (by using the proper
function). The whole "try one first, then the other" doesn't work.
They may both "work", and by virtue of that, unless you can state
"yes, we always want user space" or "yes, we always want kernel", that
"try one or the other" isn't valid.
And it can be a real security issue. If a user program can be made to
read kernel memory when BPF validated things as a user pointer, it's
an obvious security issue.
But it can be a security issue the other way around too: if the BPF
code expects to get a kernel string, but user space can fool it into
reading a user string instead by mapping something of its own into the
user space address that aliases the kernel space address, then you can
presumably fool the BPF program to do bad things too (eg mess up any
BPF packet switching routines?).
So BPF really really really needs to specify which one it is. Not
specifying it and saying "whichever" is a bug, and a security issue.
Linus
On Thu, May 14, 2020 at 01:04:38AM +0200, Daniel Borkmann wrote:
> Aside from comments on list, the series looks reasonable to me. For BPF
> the bpf_probe_read() helper would be slightly penalized for probing user
> memory given we now test on copy_from_kernel_nofault() first and if that
> fails only then fall back to copy_from_user_nofault(), but it seems
> small enough that it shouldn't matter too much and aside from that we have
> the newer bpf_probe_read_kernel() and bpf_probe_read_user() anyway that
> BPF progs should use instead, so I think it's okay.
>
> For patch 14 and patch 15, do you roughly know the performance gain with
> the new probe_kernel_read_loop() + arch_kernel_read() approach?
I don't think there should be any measurable difference in performance
for typical use cases. We'll save the stac/clac pair, but that's it.
The real eason is to avoid that stac/clac pair that opens up a window
for explots, and as a significant enabler for killing of set_fs based
address limit overrides entirely.