2015-02-17 10:51:27

by Omar Sandoval

[permalink] [raw]
Subject: [PATCH 0/3] btrfs: ENOMEM bugfixes

Hi,

As it turns out, running with low memory is a really easy way to shake
out undesirable behavior in Btrfs. This can be especially bad when
considering that a memory limit is really easy to hit in a container
(e.g., by using cgroup memory.limit_in_bytes). Here's a simple script
that can hit several problems:

----
#!/bin/sh

cgcreate -g memory:enomem
MEM=$((64 * 1024 * 1024))
echo $MEM > /sys/fs/cgroup/memory/enomem/memory.limit_in_bytes

cgexec -g memory:enomem ~/xfstests/ltp/fsstress -p128 -n999999999 -d /mnt/test &
trap "killall fsstress; exit 0" SIGINT SIGTERM

while true; do
cgexec -g memory:enomem python -c '
l = []
while True:
l.append(0)'
done
----

Ignoring for now the cases that drop the filesystem into read-only mode
with relatively little fuss, here are a few patches that fix some of the
low-hanging fruit. They apply to Linus' tree as of today.

Thanks!

Omar Sandoval (3):
btrfs: handle ENOMEM in btrfs_alloc_tree_block
btrfs: handle race on ENOMEM in alloc_extent_buffer
btrfs: check io_ctl_prepare_pages return in __btrfs_write_out_cache

fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++++++++++-------------
fs/btrfs/extent_io.c | 20 ++++++++++++++++----
fs/btrfs/free-space-cache.c | 10 ++++++----
3 files changed, 50 insertions(+), 21 deletions(-)

--
2.3.0


2015-02-17 10:51:34

by Omar Sandoval

[permalink] [raw]
Subject: [PATCH 1/3] btrfs: handle ENOMEM in btrfs_alloc_tree_block

This is one of the first places to go when memory is tight. Handle it
properly rather than with a BUG_ON.

Signed-off-by: Omar Sandoval <[email protected]>
---
fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++++++++++-------------
1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a684086..479df76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7321,7 +7321,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
* returns the key for the extent through ins, and a tree buffer for
* the first block of the extent through buf.
*
- * returns the tree buffer or NULL.
+ * returns the tree buffer or an ERR_PTR on error.
*/
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -7332,6 +7332,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
+ struct btrfs_delayed_extent_op *extent_op;
u64 flags = 0;
int ret;
u32 blocksize = root->nodesize;
@@ -7352,14 +7353,15 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,

ret = btrfs_reserve_extent(root, blocksize, blocksize,
empty_size, hint, &ins, 0, 0);
- if (ret) {
- unuse_block_rsv(root->fs_info, block_rsv, blocksize);
- return ERR_PTR(ret);
- }
+ if (ret)
+ goto out_unuse;

buf = btrfs_init_new_buffer(trans, root, ins.objectid,
blocksize, level);
- BUG_ON(IS_ERR(buf)); /* -ENOMEM */
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_free_reserved;
+ }

if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
@@ -7369,9 +7371,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(parent > 0);

if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- struct btrfs_delayed_extent_op *extent_op;
extent_op = btrfs_alloc_delayed_extent_op();
- BUG_ON(!extent_op); /* -ENOMEM */
+ if (!extent_op) {
+ ret = -ENOMEM;
+ goto out_free_buf;
+ }
if (key)
memcpy(&extent_op->key, key, sizeof(extent_op->key));
else
@@ -7386,13 +7390,24 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->level = level;

ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
- ins.objectid,
- ins.offset, parent, root_objectid,
- level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op, 0);
- BUG_ON(ret); /* -ENOMEM */
+ ins.objectid, ins.offset,
+ parent, root_objectid, level,
+ BTRFS_ADD_DELAYED_EXTENT,
+ extent_op, 0);
+ if (ret)
+ goto out_free_delayed;
}
return buf;
+
+out_free_delayed:
+ btrfs_free_delayed_extent_op(extent_op);
+out_free_buf:
+ free_extent_buffer(buf);
+out_free_reserved:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
+out_unuse:
+ unuse_block_rsv(root->fs_info, block_rsv, blocksize);
+ return ERR_PTR(ret);
}

struct walk_control {
--
2.3.0

2015-02-17 10:52:12

by Omar Sandoval

[permalink] [raw]
Subject: [PATCH 2/3] btrfs: handle race on ENOMEM in alloc_extent_buffer

Consider the following interleaving of overlapping calls to
alloc_extent_buffer:

Call 1:

- Successfully allocates a few pages with find_or_create_page
- find_or_create_page fails, goto free_eb
- Unlocks the allocated pages

Call 2:
- Calls find_or_create_page and gets a page in call 1's extent_buffer
- Finds that the page is already associated with an extent_buffer
- Grabs a reference to the half-written extent_buffer and calls
mark_extent_buffer_accessed on it

mark_extent_buffer_accessed will then try to call mark_page_accessed on
a null page and panic.

The fix is to clear page->private of the half-written extent_buffer's
pages all at once while holding mapping->private_lock.

Signed-off-by: Omar Sandoval <[email protected]>
---
fs/btrfs/extent_io.c | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c73df6a..6024db9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4850,6 +4850,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
mark_extent_buffer_accessed(exists, p);
goto free_eb;
}
+ exists = NULL;

/*
* Do this so attach doesn't complain and we need to
@@ -4913,13 +4914,24 @@ again:
return eb;

free_eb:
+ spin_lock(&mapping->private_lock);
for (i = 0; i < num_pages; i++) {
- if (eb->pages[i])
- unlock_page(eb->pages[i]);
- }
+ struct page *page = eb->pages[i];

+ if (page) {
+ unlock_page(page);
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ /* One for the page private */
+ page_cache_release(page);
+ /* One for when we alloced the page */
+ page_cache_release(page);
+ }
+ }
+ spin_unlock(&mapping->private_lock);
WARN_ON(!atomic_dec_and_test(&eb->refs));
- btrfs_release_extent_buffer(eb);
+ __free_extent_buffer(eb);
+
return exists;
}

--
2.3.0

2015-02-17 10:51:50

by Omar Sandoval

[permalink] [raw]
Subject: [PATCH 3/3] btrfs: check io_ctl_prepare_pages return in __btrfs_write_out_cache

If io_ctl_prepare_pages fails, the pages in io_ctl.pages are not valid.
When we try to access them later, things will blow up in various ways.

Signed-off-by: Omar Sandoval <[email protected]>
---
fs/btrfs/free-space-cache.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d6c03f7..0460632 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1114,7 +1114,7 @@ cleanup_write_cache_enospc(struct inode *inode,
*
* This function writes out a free space cache struct to disk for quick recovery
* on mount. This will return 0 if it was successfull in writing the cache out,
- * and -1 if it was not.
+ * or an errno if it was not.
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
@@ -1130,11 +1130,11 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
int ret;

if (!i_size_read(inode))
- return -1;
+ return -EIO;

ret = io_ctl_init(&io_ctl, inode, root, 1);
if (ret)
- return -1;
+ return ret;

if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
down_write(&block_group->data_rwsem);
@@ -1151,7 +1151,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
}

/* Lock all pages first so we can lock the extent safely. */
- io_ctl_prepare_pages(&io_ctl, inode, 0);
+ ret = io_ctl_prepare_pages(&io_ctl, inode, 0);
+ if (ret)
+ goto out;

lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state);
--
2.3.0

2015-02-17 18:44:20

by Omar Sandoval

[permalink] [raw]
Subject: Re: [PATCH 2/3] btrfs: handle race on ENOMEM in alloc_extent_buffer

On Tue, Feb 17, 2015 at 02:51:08AM -0800, Omar Sandoval wrote:
> Consider the following interleaving of overlapping calls to
> alloc_extent_buffer:
>
> Call 1:
>
> - Successfully allocates a few pages with find_or_create_page
> - find_or_create_page fails, goto free_eb
> - Unlocks the allocated pages
>
> Call 2:
> - Calls find_or_create_page and gets a page in call 1's extent_buffer
> - Finds that the page is already associated with an extent_buffer
> - Grabs a reference to the half-written extent_buffer and calls
> mark_extent_buffer_accessed on it
>
> mark_extent_buffer_accessed will then try to call mark_page_accessed on
> a null page and panic.
>
> The fix is to clear page->private of the half-written extent_buffer's
> pages all at once while holding mapping->private_lock.
>
> Signed-off-by: Omar Sandoval <[email protected]>
> ---
> fs/btrfs/extent_io.c | 20 ++++++++++++++++----
> 1 file changed, 16 insertions(+), 4 deletions(-)
>
[snip]

Actually, I just realized that there's a simpler fix. I can resend the
whole series for easier merging once I get some review, but for now,
here's what I'm talking about:


btrfs: handle race on ENOMEM in alloc_extent_buffer

Consider the following interleaving of overlapping calls to
alloc_extent_buffer:

Call 1:

- Successfully allocates a few pages with find_or_create_page
- find_or_create_page fails, goto free_eb
- Unlocks the allocated pages

Call 2:
- Calls find_or_create_page and gets a page in call 1's extent_buffer
- Finds that the page is already associated with an extent_buffer
- Grabs a reference to the half-written extent_buffer and calls
mark_extent_buffer_accessed on it

mark_extent_buffer_accessed will then try to call mark_page_accessed on
a null page and panic.

The fix is to decrement the reference count on the half-written
extent_buffer before unlocking the pages so call 2 won't use it. We also
set exists = NULL in the case that we don't use exists to avoid
accidentally returning a freed extent_buffer in an error case.

Signed-off-by: Omar Sandoval <[email protected]>
---
fs/btrfs/extent_io.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 790dbae..6b3cd72 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4850,6 +4850,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
mark_extent_buffer_accessed(exists, p);
goto free_eb;
}
+ exists = NULL;

/*
* Do this so attach doesn't complain and we need to
@@ -4913,12 +4914,12 @@ again:
return eb;

free_eb:
+ WARN_ON(!atomic_dec_and_test(&eb->refs));
for (i = 0; i < num_pages; i++) {
if (eb->pages[i])
unlock_page(eb->pages[i]);
}

- WARN_ON(!atomic_dec_and_test(&eb->refs));
btrfs_release_extent_buffer(eb);
return exists;
}

--
Omar

2015-02-20 21:20:11

by Omar Sandoval

[permalink] [raw]
Subject: Re: [PATCH 0/3] btrfs: ENOMEM bugfixes

On Tue, Feb 17, 2015 at 02:51:06AM -0800, Omar Sandoval wrote:
> Hi,
>
> As it turns out, running with low memory is a really easy way to shake
> out undesirable behavior in Btrfs. This can be especially bad when
> considering that a memory limit is really easy to hit in a container
> (e.g., by using cgroup memory.limit_in_bytes). Here's a simple script
> that can hit several problems:
>
> ----
> #!/bin/sh
>
> cgcreate -g memory:enomem
> MEM=$((64 * 1024 * 1024))
> echo $MEM > /sys/fs/cgroup/memory/enomem/memory.limit_in_bytes
>
> cgexec -g memory:enomem ~/xfstests/ltp/fsstress -p128 -n999999999 -d /mnt/test &
> trap "killall fsstress; exit 0" SIGINT SIGTERM
>
> while true; do
> cgexec -g memory:enomem python -c '
> l = []
> while True:
> l.append(0)'
> done
> ----
>
> Ignoring for now the cases that drop the filesystem into read-only mode
> with relatively little fuss, here are a few patches that fix some of the
> low-hanging fruit. They apply to Linus' tree as of today.
>
So I didn't realize this until I saw Tetsuo Handa's email to the ext4
list (http://thread.gmane.org/gmane.comp.file-systems.ext4/47855), but
it looks like this behavior was exposed by a change to the kernel memory
allocator related to the too-small-to-fail allocation fiasco. To
summarize, Commit 9879de7373fc (mm: page_alloc: embed OOM killing
naturally into allocation slowpath), merged for v3.19-rc7, changed the
behavior of GFP_NOFS allocations which makes it much easier to trigger
allocation failures in filesystems.

This means that Btrfs falls over under memory pressure pretty easily
now, so it might be a good idea to follow the conversation over at
linux-mm (http://thread.gmane.org/gmane.linux.kernel.mm/126398).

These are bugs regardless of the outcome there, however, so I'd like to
see this patch series merged.

Thanks again!
> Thanks!
>
> Omar Sandoval (3):
> btrfs: handle ENOMEM in btrfs_alloc_tree_block
> btrfs: handle race on ENOMEM in alloc_extent_buffer
> btrfs: check io_ctl_prepare_pages return in __btrfs_write_out_cache
>
> fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++++++++++-------------
> fs/btrfs/extent_io.c | 20 ++++++++++++++++----
> fs/btrfs/free-space-cache.c | 10 ++++++----
> 3 files changed, 50 insertions(+), 21 deletions(-)
>
> --
> 2.3.0
>

--
Omar

2015-02-20 21:22:20

by Josef Bacik

[permalink] [raw]
Subject: Re: [PATCH 0/3] btrfs: ENOMEM bugfixes

On 02/20/2015 04:20 PM, Omar Sandoval wrote:
> On Tue, Feb 17, 2015 at 02:51:06AM -0800, Omar Sandoval wrote:
>> Hi,
>>
>> As it turns out, running with low memory is a really easy way to shake
>> out undesirable behavior in Btrfs. This can be especially bad when
>> considering that a memory limit is really easy to hit in a container
>> (e.g., by using cgroup memory.limit_in_bytes). Here's a simple script
>> that can hit several problems:
>>
>> ----
>> #!/bin/sh
>>
>> cgcreate -g memory:enomem
>> MEM=$((64 * 1024 * 1024))
>> echo $MEM > /sys/fs/cgroup/memory/enomem/memory.limit_in_bytes
>>
>> cgexec -g memory:enomem ~/xfstests/ltp/fsstress -p128 -n999999999 -d /mnt/test &
>> trap "killall fsstress; exit 0" SIGINT SIGTERM
>>
>> while true; do
>> cgexec -g memory:enomem python -c '
>> l = []
>> while True:
>> l.append(0)'
>> done
>> ----
>>
>> Ignoring for now the cases that drop the filesystem into read-only mode
>> with relatively little fuss, here are a few patches that fix some of the
>> low-hanging fruit. They apply to Linus' tree as of today.
>>
> So I didn't realize this until I saw Tetsuo Handa's email to the ext4
> list (https://urldefense.proofpoint.com/v1/url?u=http://thread.gmane.org/gmane.comp.file-systems.ext4/47855&k=ZVNjlDMF0FElm4dQtryO4A%3D%3D%0A&r=cKCbChRKsMpTX8ybrSkonQ%3D%3D%0A&m=nzG8bjaiVMyWylHxOvTeXimzfSNyukj4%2BAxs0AZ%2FxOI%3D%0A&s=cbd7d48f1866e79f75b88b7f94a394c53d34adfcc1a30a842382f653c978e180), but
> it looks like this behavior was exposed by a change to the kernel memory
> allocator related to the too-small-to-fail allocation fiasco. To
> summarize, Commit 9879de7373fc (mm: page_alloc: embed OOM killing
> naturally into allocation slowpath), merged for v3.19-rc7, changed the
> behavior of GFP_NOFS allocations which makes it much easier to trigger
> allocation failures in filesystems.
>
> This means that Btrfs falls over under memory pressure pretty easily
> now, so it might be a good idea to follow the conversation over at
> linux-mm (https://urldefense.proofpoint.com/v1/url?u=http://thread.gmane.org/gmane.linux.kernel.mm/126398&k=ZVNjlDMF0FElm4dQtryO4A%3D%3D%0A&r=cKCbChRKsMpTX8ybrSkonQ%3D%3D%0A&m=nzG8bjaiVMyWylHxOvTeXimzfSNyukj4%2BAxs0AZ%2FxOI%3D%0A&s=5177c5ceb03f82d8abb0beeeb4dc5e0c45cc77e9687881590e3ef1701f069a85).
>
> These are bugs regardless of the outcome there, however, so I'd like to
> see this patch series merged.
>

Yeah I'm fine with this, your stuff fixes actual problems and they look
sane so I'm cool with taking them. Regardless of what the mm guys do we
shouldn't fall over horribly when allocations fail. Thanks,

Josef

2015-02-22 14:59:16

by Liu Bo

[permalink] [raw]
Subject: Re: [PATCH 3/3] btrfs: check io_ctl_prepare_pages return in __btrfs_write_out_cache

On Tue, Feb 17, 2015 at 02:51:09AM -0800, Omar Sandoval wrote:
> If io_ctl_prepare_pages fails, the pages in io_ctl.pages are not valid.
> When we try to access them later, things will blow up in various ways.
>

Looks good.

Reviewed-by: Liu Bo <[email protected]>

> Signed-off-by: Omar Sandoval <[email protected]>
> ---
> fs/btrfs/free-space-cache.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index d6c03f7..0460632 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -1114,7 +1114,7 @@ cleanup_write_cache_enospc(struct inode *inode,
> *
> * This function writes out a free space cache struct to disk for quick recovery
> * on mount. This will return 0 if it was successfull in writing the cache out,
> - * and -1 if it was not.
> + * or an errno if it was not.
> */
> static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
> struct btrfs_free_space_ctl *ctl,
> @@ -1130,11 +1130,11 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
> int ret;
>
> if (!i_size_read(inode))
> - return -1;
> + return -EIO;
>
> ret = io_ctl_init(&io_ctl, inode, root, 1);
> if (ret)
> - return -1;
> + return ret;
>
> if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
> down_write(&block_group->data_rwsem);
> @@ -1151,7 +1151,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
> }
>
> /* Lock all pages first so we can lock the extent safely. */
> - io_ctl_prepare_pages(&io_ctl, inode, 0);
> + ret = io_ctl_prepare_pages(&io_ctl, inode, 0);
> + if (ret)
> + goto out;
>
> lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
> 0, &cached_state);
> --
> 2.3.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Thanks,

-liubo