LinuxLists.cc - get rid of the address_space override in setsockopt v2

2020-07-23 06:13:20

Subject: get rid of the address_space override in setsockopt v2

2020-07-23 06:13:48

by Christoph Hellwig

[permalink] [raw]

Subject: [PATCH 07/26] net: switch sock_set_timeout to sockptr_t

Pass a sockptr_t to prepare for set_fs-less handling of the kernel
pointer from bpf-cgroup.

Signed-off-by: Christoph Hellwig <[email protected]>
---
net/core/sock.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 5b55bc9397f282..8b9eddaff868a5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -361,7 +361,8 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
return sizeof(tv);
}

-static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
+static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
+ bool old_timeval)
{
struct __kernel_sock_timeval tv;

@@ -371,7 +372,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool
if (optlen < sizeof(tv32))
return -EINVAL;

- if (copy_from_user(&tv32, optval, sizeof(tv32)))
+ if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
return -EFAULT;
tv.tv_sec = tv32.tv_sec;
tv.tv_usec = tv32.tv_usec;
@@ -380,14 +381,14 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool

if (optlen < sizeof(old_tv))
return -EINVAL;
- if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
+ if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
return -EFAULT;
tv.tv_sec = old_tv.tv_sec;
tv.tv_usec = old_tv.tv_usec;
} else {
if (optlen < sizeof(tv))
return -EINVAL;
- if (copy_from_user(&tv, optval, sizeof(tv)))
+ if (copy_from_sockptr(&tv, optval, sizeof(tv)))
return -EFAULT;
}
if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
@@ -1051,12 +1052,14 @@ int sock_setsockopt(struct socket *sock, int level, int optname,

case SO_RCVTIMEO_OLD:
case SO_RCVTIMEO_NEW:
- ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
+ ret = sock_set_timeout(&sk->sk_rcvtimeo, USER_SOCKPTR(optval),
+ optlen, optname == SO_RCVTIMEO_OLD);
break;

case SO_SNDTIMEO_OLD:
case SO_SNDTIMEO_NEW:
- ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
+ ret = sock_set_timeout(&sk->sk_sndtimeo, USER_SOCKPTR(optval),
+ optlen, optname == SO_SNDTIMEO_OLD);
break;

case SO_ATTACH_FILTER: {
--
2.27.0

2020-07-23 06:13:59

by Christoph Hellwig

[permalink] [raw]

Subject: [PATCH 02/26] net/bpfilter: split __bpfilter_process_sockopt

Split __bpfilter_process_sockopt into a low-level send request routine and
the actual setsockopt hook to split the init time ping from the actual
setsockopt processing.

Signed-off-by: Christoph Hellwig <[email protected]>
---
net/bpfilter/bpfilter_kern.c | 51 +++++++++++++++++++-----------------
1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 3bac5820062af1..78d561f2c54da7 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -31,48 +31,51 @@ static void __stop_umh(void)
shutdown_umh();
}

-static int __bpfilter_process_sockopt(struct sock *sk, int optname,
- char __user *optval,
- unsigned int optlen, bool is_set)
+static int bpfilter_send_req(struct mbox_request *req)
{
- struct mbox_request req;
struct mbox_reply reply;
loff_t pos;
ssize_t n;
- int ret = -EFAULT;

- req.is_set = is_set;
- req.pid = current->pid;
- req.cmd = optname;
- req.addr = (uintptr_t)optval;
- req.len = optlen;
if (!bpfilter_ops.info.tgid)
- goto out;
+ return -EFAULT;
pos = 0;
- n = kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req),
+ n = kernel_write(bpfilter_ops.info.pipe_to_umh, req, sizeof(*req),
&pos);
- if (n != sizeof(req)) {
+ if (n != sizeof(*req)) {
pr_err("write fail %zd\n", n);
- __stop_umh();
- ret = -EFAULT;
- goto out;
+ goto stop;
}
pos = 0;
n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply),
&pos);
if (n != sizeof(reply)) {
pr_err("read fail %zd\n", n);
- __stop_umh();
- ret = -EFAULT;
- goto out;
+ goto stop;
}
- ret = reply.status;
-out:
- return ret;
+ return reply.status;
+stop:
+ __stop_umh();
+ return -EFAULT;
+}
+
+static int bpfilter_process_sockopt(struct sock *sk, int optname,
+ char __user *optval, unsigned int optlen,
+ bool is_set)
+{
+ struct mbox_request req = {
+ .is_set = is_set,
+ .pid = current->pid,
+ .cmd = optname,
+ .addr = (uintptr_t)optval,
+ .len = optlen,
+ };
+ return bpfilter_send_req(&req);
}

static int start_umh(void)
{
+ struct mbox_request req = { .pid = current->pid };
int err;

/* fork usermode process */
@@ -82,7 +85,7 @@ static int start_umh(void)
pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid));

/* health check that usermode process started correctly */
- if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) {
+ if (bpfilter_send_req(&req) != 0) {
shutdown_umh();
return -EFAULT;
}
@@ -103,7 +106,7 @@ static int __init load_umh(void)
mutex_lock(&bpfilter_ops.lock);
err = start_umh();
if (!err && IS_ENABLED(CONFIG_INET)) {
- bpfilter_ops.sockopt = &__bpfilter_process_sockopt;
+ bpfilter_ops.sockopt = &bpfilter_process_sockopt;
bpfilter_ops.start = &start_umh;
}
mutex_unlock(&bpfilter_ops.lock);
--
2.27.0

2020-07-23 06:14:10

by Christoph Hellwig

[permalink] [raw]

Subject: [PATCH 04/26] net: add a new sockptr_t type

Add a uptr_t type that can hold a pointer to either a user or kernel
memory region, and simply helpers to copy to and from it.

Signed-off-by: Christoph Hellwig <[email protected]>
---
include/linux/sockptr.h | 104 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 104 insertions(+)
create mode 100644 include/linux/sockptr.h

diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
new file mode 100644
index 00000000000000..700856e13ea0c4
--- /dev/null
+++ b/include/linux/sockptr.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2020 Christoph Hellwig.
+ *
+ * Support for "universal" pointers that can point to either kernel or userspace
+ * memory.
+ */
+#ifndef _LINUX_SOCKPTR_H
+#define _LINUX_SOCKPTR_H
+
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+typedef struct {
+ union {
+ void *kernel;
+ void __user *user;
+ };
+ bool is_kernel : 1;
+} sockptr_t;
+
+static inline bool sockptr_is_kernel(sockptr_t sockptr)
+{
+ return sockptr.is_kernel;
+}
+
+static inline sockptr_t KERNEL_SOCKPTR(void *p)
+{
+ return (sockptr_t) { .kernel = p, .is_kernel = true };
+}
+
+static inline sockptr_t USER_SOCKPTR(void __user *p)
+{
+ return (sockptr_t) { .user = p };
+}
+
+static inline bool sockptr_is_null(sockptr_t sockptr)
+{
+ return !sockptr.user && !sockptr.kernel;
+}
+
+static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
+{
+ if (!sockptr_is_kernel(src))
+ return copy_from_user(dst, src.user, size);
+ memcpy(dst, src.kernel, size);
+ return 0;
+}
+
+static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size)
+{
+ if (!sockptr_is_kernel(dst))
+ return copy_to_user(dst.user, src, size);
+ memcpy(dst.kernel, src, size);
+ return 0;
+}
+
+static inline void *memdup_sockptr(sockptr_t src, size_t len)
+{
+ void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
+
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+ if (copy_from_sockptr(p, src, len)) {
+ kfree(p);
+ return ERR_PTR(-EFAULT);
+ }
+ return p;
+}
+
+static inline void *memdup_sockptr_nul(sockptr_t src, size_t len)
+{
+ char *p = kmalloc_track_caller(len + 1, GFP_KERNEL);
+
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+ if (copy_from_sockptr(p, src, len)) {
+ kfree(p);
+ return ERR_PTR(-EFAULT);
+ }
+ p[len] = '\0';
+ return p;
+}
+
+static inline void sockptr_advance(sockptr_t sockptr, size_t len)
+{
+ if (sockptr_is_kernel(sockptr))
+ sockptr.kernel += len;
+ else
+ sockptr.user += len;
+}
+
+static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count)
+{
+ if (sockptr_is_kernel(src)) {
+ size_t len = min(strnlen(src.kernel, count - 1) + 1, count);
+
+ memcpy(dst, src.kernel, len);
+ return len;
+ }
+ return strncpy_from_user(dst, src.user, count);
+}
+
+#endif /* _LINUX_SOCKPTR_H */
--
2.27.0

2020-07-23 06:14:24

by Christoph Hellwig

[permalink] [raw]

Subject: [PATCH 03/26] bpfilter: reject kernel addresses

The bpfilter user mode helper processes the optval address using
process_vm_readv. Don't send it kernel addresses fed under
set_fs(KERNEL_DS) as that won't work.

Signed-off-by: Christoph Hellwig <[email protected]>
---
net/bpfilter/bpfilter_kern.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 78d561f2c54da7..00540457e5f4d3 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -70,6 +70,10 @@ static int bpfilter_process_sockopt(struct sock *sk, int optname,
.addr = (uintptr_t)optval,
.len = optlen,
};
+ if (uaccess_kernel()) {
+ pr_err("kernel access not supported\n");
+ return -EFAULT;
+ }
return bpfilter_send_req(&req);
}

--
2.27.0

2020-07-23 14:42:56

by David Laight

[permalink] [raw]

Subject: RE: [PATCH 03/26] bpfilter: reject kernel addresses

From: Christoph Hellwig
> Sent: 23 July 2020 07:09
>
> The bpfilter user mode helper processes the optval address using
> process_vm_readv. Don't send it kernel addresses fed under
> set_fs(KERNEL_DS) as that won't work.

What sort of operations is the bpf filter doing on the sockopt buffers?

Any attempts to reject some requests can be thwarted by a second
application thread modifying the buffer after the bpf filter has
checked that it allowed.

You can't do security by reading a user buffer twice.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2020-07-23 14:45:52

by Christoph Hellwig

[permalink] [raw]

Subject: Re: [PATCH 03/26] bpfilter: reject kernel addresses

On Thu, Jul 23, 2020 at 02:42:11PM +0000, David Laight wrote:
> From: Christoph Hellwig
> > Sent: 23 July 2020 07:09
> >
> > The bpfilter user mode helper processes the optval address using
> > process_vm_readv. Don't send it kernel addresses fed under
> > set_fs(KERNEL_DS) as that won't work.
>
> What sort of operations is the bpf filter doing on the sockopt buffers?
>
> Any attempts to reject some requests can be thwarted by a second
> application thread modifying the buffer after the bpf filter has
> checked that it allowed.
>
> You can't do security by reading a user buffer twice.

I'm not saying that I approve of the design, but the current bpfilter
design uses process_vm_readv to access the buffer, which obviously does
not work with kernel buffers.

2020-07-24 22:44:21

by David Miller

[permalink] [raw]

Subject: Re: get rid of the address_space override in setsockopt v2

From: Christoph Hellwig <[email protected]>
Date: Thu, 23 Jul 2020 08:08:42 +0200

> setsockopt is the last place in architecture-independ code that still
> uses set_fs to force the uaccess routines to operate on kernel pointers.
>
> This series adds a new sockptr_t type that can contained either a kernel
> or user pointer, and which has accessors that do the right thing, and
> then uses it for setsockopt, starting by refactoring some low-level
> helpers and moving them over to it before finally doing the main
> setsockopt method.
>
> Note that apparently the eBPF selftests do not even cover this path, so
> the series has been tested with a testing patch that always copies the
> data first and passes a kernel pointer. This is something that works for
> most common sockopts (and is something that the ePBF support relies on),
> but unfortunately in various corner cases we either don't use the passed
> in length, or in one case actually copy data back from setsockopt, or in
> case of bpfilter straight out do not work with kernel pointers at all.
>
> Against net-next/master.
>
> Changes since v1:
> - check that users don't pass in kernel addresses
> - more bpfilter cleanups
> - cosmetic mptcp tweak

Series applied to net-next, I'm build testing and will push this out when
that is done.

Thanks.

2020-07-26 07:09:30

by Andreas Schwab

[permalink] [raw]

Subject: Re: get rid of the address_space override in setsockopt v2

On Jul 26 2020, Christoph Hellwig wrote:

> From 6601732f7a54db5f04efba08f7e9224e5b757112 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <[email protected]>
> Date: Sun, 26 Jul 2020 09:00:09 +0200
> Subject: mISDN: remove a debug printk in data_sock_setsockopt
>
> The %p won't work with the new sockptr_t type. But in the times of
> ftrace, bpftrace and co these kinds of debug printks are pretty anyway,

I think there is a word missing after pretty.

Andreas.

--
Andreas Schwab, [email protected]
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1
"And now for something completely different."

2020-07-26 07:47:25

by David Miller

[permalink] [raw]

Subject: Re: get rid of the address_space override in setsockopt v2

From: Christoph Hellwig <[email protected]>
Date: Sun, 26 Jul 2020 09:03:11 +0200

> On Fri, Jul 24, 2020 at 03:43:42PM -0700, David Miller wrote:
>> > Changes since v1:
>> > - check that users don't pass in kernel addresses
>> > - more bpfilter cleanups
>> > - cosmetic mptcp tweak
>>
>> Series applied to net-next, I'm build testing and will push this out when
>> that is done.
>
> The buildbot found one warning with the isdn debug code after a few
> days, here is what I think is the best fix:

I already fixed this in net-next.

2020-07-27 14:10:43

by David Laight

[permalink] [raw]

Subject: RE: get rid of the address_space override in setsockopt v2

From: Al Viro
> Sent: 27 July 2020 14:48
>
> On Mon, Jul 27, 2020 at 09:51:45AM +0000, David Laight wrote:
>
> > I'm sure there is code that processes options in chunks.
> > This probably means it is possible to put a chunk boundary
> > at the end of userspace and continue processing the very start
> > of kernel memory.
> >
> > At best this faults on the kernel copy code and crashes the system.
>
> Really? Care to provide some details, or is it another of your "I can't
> be possibly arsed to check what I'm saying, but it stands for reason
> that..." specials?

I did more 'homework' than sometimes :-)
Slightly difficult without a searchable net-next tree.
However, as has been pointed out is a different thread
this code is used to update IPv6 flow labels:

> > - if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen))
> > + sockptr_advance(optval, CMSG_ALIGN(sizeof(*freq)));
> > + if (copy_from_sockptr(fl->opt + 1, optval, olen))
> > goto done;

and doesn't work because the advances are no longer cumulative.

Now access_ok() has to take the base address and length to stop
'running into' kernel space, but the code above can advance from
a valid user pointer (which won't fault) to a kernel address.

If there were always an unmapped 'guard' page in the user address
space the access_ok() check prior to copy_to/from_user() wouldn't
need the length.
So I surmise that no such guard page exists and so the above
can advance from user addresses into kernel ones.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)