2021-06-02 16:32:19

by Jarmo Tiitto

[permalink] [raw]
Subject: [PATCH 1/1] pgo: Fix sleep in atomic section in prf_open()

In prf_open() the required buffer size can be so large that
vzalloc() may sleep, thus triggering this bug:

======
BUG: sleeping function called from invalid context at include/linux/sched/mm.h:201
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 337, name: cat
CPU: 1 PID: 337 Comm: cat Not tainted 5.13.0-rc2-24-hack+ #154
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
Call Trace:
dump_stack+0xc7/0x134
___might_sleep+0x177/0x190
__might_sleep+0x5a/0x90
kmem_cache_alloc_node_trace+0x6b/0x3a0
? __get_vm_area_node+0xcd/0x1b0
? dput+0x283/0x300
__get_vm_area_node+0xcd/0x1b0
__vmalloc_node_range+0x7b/0x420
? prf_open+0x1da/0x580
? prf_open+0x32/0x580
? __llvm_profile_instrument_memop+0x36/0x50
vzalloc+0x54/0x60
? prf_open+0x1da/0x580
prf_open+0x1da/0x580
full_proxy_open+0x211/0x370
....
======

This patch avoids holding prf_lock() while calling
vzalloc(). The problem with that is that prf_buffer_size()
*must* be called with prf_lock() held, and the buffer
size may change while we call vzalloc().

So first get the buffer size, release the lock and allocate.
Then re-lock and call prf_serialize(), which now checks whether
the buffer is big enough. If not, the code loops.

Signed-off-by: Jarmo Tiitto <[email protected]>
---
kernel/pgo/fs.c | 45 +++++++++++++++++++++++++++++++++++----------
1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/kernel/pgo/fs.c b/kernel/pgo/fs.c
index ef985159dad3..e8ac07637423 100644
--- a/kernel/pgo/fs.c
+++ b/kernel/pgo/fs.c
@@ -227,16 +227,15 @@ static unsigned long prf_buffer_size(void)
* Serialize the profiling data into a format LLVM's tools can understand.
* Note: caller *must* hold pgo_lock.
*/
-static int prf_serialize(struct prf_private_data *p)
+static int prf_serialize(struct prf_private_data *p, unsigned long *buf_size)
{
int err = 0;
void *buffer;

- p->size = prf_buffer_size();
- p->buffer = vzalloc(p->size);
+ *buf_size = prf_buffer_size();

- if (!p->buffer) {
- err = -ENOMEM;
+ if (p->size < *bufsize) {
+ err = -EAGAIN;
goto out;
}

@@ -259,6 +258,7 @@ static int prf_open(struct inode *inode, struct file *file)
{
struct prf_private_data *data;
unsigned long flags;
+ unsigned long buf_size;
int err;

data = kzalloc(sizeof(*data), GFP_KERNEL);
@@ -267,14 +267,39 @@ static int prf_open(struct inode *inode, struct file *file)
goto out;
}

+ /* note: vzalloc() cannot be used in atomic section.
+ * However to get the buffer size prf_lock() *must*
+ * be taken. So take lock, get buffer size, release
+ * the lock and allocate.
+ * prf_serialize() then checks if buffer has enough space.
+ */
flags = prf_lock();
+ buf_size = prf_buffer_size();

- err = prf_serialize(data);
- if (unlikely(err)) {
- kfree(data);
- goto out_unlock;
- }
+ do {
+ prf_unlock(flags);
+
+ /* resize buffer */
+ if (data->size < buf_size && data->buffer) {
+ vfree(data->buffer);
+ data->buffer = NULL;
+ }
+
+ if (!data->buffer) {
+ data->size = buf_size;
+ data->buffer = vzalloc(data->size);
+
+ if (!data->buffer) {
+ err = -ENOMEM;
+ kfree(data);
+ goto out;
+ }
+ }
+ /* try serialize */
+ flags = prf_lock();
+ } while (prf_serialize(data, &buf_size));

+ data->size = buf_size;
file->private_data = data;

out_unlock:

base-commit: e1af496cbe9b4517428601a4e44fee3602dd3c15
prerequisite-patch-id: fccc1bd89bbd33af13a4ce9bc3c913e6e3cdecee
prerequisite-patch-id: a2e53c0b44ad39c78ed7bc7aad40d133548a13b5
prerequisite-patch-id: 12f0e468a3d0ff12c7f5bc640f213be3b5dd261b
prerequisite-patch-id: 707b836b1969958b5131dfa1b9f044eae5f4a76a
--
2.31.1


2021-06-02 17:35:53

by Kees Cook

[permalink] [raw]
Subject: Re: [PATCH 1/1] pgo: Fix sleep in atomic section in prf_open()

On Wed, Jun 02, 2021 at 07:26:40PM +0300, Jarmo Tiitto wrote:
> In prf_open() the required buffer size can be so large that
> vzalloc() may sleep thus triggering bug:
>
> ======
> BUG: sleeping function called from invalid context at include/linux/sched/mm.h:201
> in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 337, name: cat
> CPU: 1 PID: 337 Comm: cat Not tainted 5.13.0-rc2-24-hack+ #154
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> Call Trace:
> dump_stack+0xc7/0x134
> ___might_sleep+0x177/0x190
> __might_sleep+0x5a/0x90
> kmem_cache_alloc_node_trace+0x6b/0x3a0
> ? __get_vm_area_node+0xcd/0x1b0
> ? dput+0x283/0x300
> __get_vm_area_node+0xcd/0x1b0
> __vmalloc_node_range+0x7b/0x420
> ? prf_open+0x1da/0x580
> ? prf_open+0x32/0x580
> ? __llvm_profile_instrument_memop+0x36/0x50
> vzalloc+0x54/0x60
> ? prf_open+0x1da/0x580
> prf_open+0x1da/0x580
> full_proxy_open+0x211/0x370
> ....
> ======

Ah-ha; nice catch!

>
> This patch avoids holding the prf_lock() while calling
> vzalloc(). Problem with that is prf_buffer_size()
> *must* be called with prf_lock() held and the buffer
> size may change while we call vzalloc()
>
> So first get buffer size, release the lock and allocate.
> Then re-lock and call prf_serialize() that now checks if
> the buffer is big enough. If not, the code loops.
>
> Signed-off-by: Jarmo Tiitto <[email protected]>
> ---
> kernel/pgo/fs.c | 45 +++++++++++++++++++++++++++++++++++----------
> 1 file changed, 35 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/pgo/fs.c b/kernel/pgo/fs.c
> index ef985159dad3..e8ac07637423 100644
> --- a/kernel/pgo/fs.c
> +++ b/kernel/pgo/fs.c
> @@ -227,16 +227,15 @@ static unsigned long prf_buffer_size(void)
> * Serialize the profiling data into a format LLVM's tools can understand.
> * Note: caller *must* hold pgo_lock.
> */
> -static int prf_serialize(struct prf_private_data *p)
> +static int prf_serialize(struct prf_private_data *p, unsigned long *buf_size)
> {
> int err = 0;
> void *buffer;
>
> - p->size = prf_buffer_size();
> - p->buffer = vzalloc(p->size);
> + *buf_size = prf_buffer_size();
>
> - if (!p->buffer) {
> - err = -ENOMEM;
> + if (p->size < *bufsize) {

Nit: please change prf_private_data::size to size_t while you're
touching this code.

> + err = -EAGAIN;
> goto out;
> }
>
> @@ -259,6 +258,7 @@ static int prf_open(struct inode *inode, struct file *file)
> {
> struct prf_private_data *data;
> unsigned long flags;
> + unsigned long buf_size;
> int err;
>
> data = kzalloc(sizeof(*data), GFP_KERNEL);
> @@ -267,14 +267,39 @@ static int prf_open(struct inode *inode, struct file *file)
> goto out;
> }
>
> + /* note: vzalloc() can be used in atomic section.
> + * However to get the buffer size prf_lock() *must*
> + * be taken. So take lock, get buffer size, release
> + * the lock and allocate.
> + * prf_serialize() then checks if buffer has enough space.
> + */
> flags = prf_lock();
> + buf_size = prf_buffer_size();
>
> - err = prf_serialize(data);
> - if (unlikely(err)) {
> - kfree(data);
> - goto out_unlock;
> - }
> + do {
> + prf_unlock(flags);
> +
> + /* resize buffer */
> + if (data->size < buf_size && data->buffer) {
> + vfree(data->buffer);
> + data->buffer = NULL;
> + }
> +
> + if (!data->buffer) {
> + data->size = buf_size;
> + data->buffer = vzalloc(data->size);
> +
> + if (!data->buffer) {
> + err = -ENOMEM;
> + kfree(data);
> + goto out;
> + }
> + }
> + /* try serialize */
> + flags = prf_lock();
> + } while (prf_serialize(data, &buf_size));

I'm not a fan of loops where it's hard to answer the question "how do we
know this loop will always terminate?"

Given that vmalloc allocates PAGE_SIZE-granular regions, how about
rounding up to likely avoid multiple passes, and putting the growth check
explicitly in the loop, rather than just looking at "any" prf_serialize()
failure?

e.g.:

struct prf_private_data *data;
int err = -ENOMEM;

data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
goto out_free;

do {
unsigned long flags;
size_t size;

size = PAGE_ALIGN(prf_buffer_size());
/* Required buffer size must be growing with each loop. */
if (WARN_ON_ONCE(size <= data->size)) {
err = -ENOMEM;
goto out_free;
}

if (data->buf)
vfree(data->buf);
data->buf = vzalloc(size);
if (!data->buf) {
err = -ENOMEM;
goto out_free;
}
data->size = size;

flags = prf_lock();
err = prf_serialize(data);
prf_unlock(flags);
} while (err == -EAGAIN);

if (err)
goto out_free;

file->private_data = data;
return 0;

out_free:
if (data)
vfree(data->buf);
kfree(data);
return err;


>
> + data->size = buf_size;
> file->private_data = data;
>
> out_unlock:
>
> base-commit: e1af496cbe9b4517428601a4e44fee3602dd3c15
> prerequisite-patch-id: fccc1bd89bbd33af13a4ce9bc3c913e6e3cdecee
> prerequisite-patch-id: a2e53c0b44ad39c78ed7bc7aad40d133548a13b5
> prerequisite-patch-id: 12f0e468a3d0ff12c7f5bc640f213be3b5dd261b
> prerequisite-patch-id: 707b836b1969958b5131dfa1b9f044eae5f4a76a
> --
> 2.31.1
>

--
Kees Cook