2009-12-02 01:14:07

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 1/2] jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer()

OOM happens.

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/jbd2/commit.c | 4 ++++
fs/jbd2/journal.c | 4 ++++
2 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d4cfd6d..8896c1d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
jh, &new_jh, blocknr);
+ if (flags < 0) {
+ jbd2_journal_abort(journal, flags);
+ continue;
+ }
set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
wbuf[bufs++] = jh2bh(new_jh);

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index af60d98..3f473fa 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -358,6 +358,10 @@ repeat:

jbd_unlock_bh_state(bh_in);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
+ if (!tmp) {
+ jbd2_journal_put_journal_head(new_jh);
+ return -ENOMEM;
+ }
jbd_lock_bh_state(bh_in);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
--
1.6.5.216.g5288a.dirty



2009-12-02 01:14:07

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations

Now that the SLUB seems to be fixed so that it respects the requested
alignment, use kmem_cache_alloc() to allocator if the block size of
the buffer heads to be allocated is less than the page size.
Previously, we were using 16k page on a Power system for each buffer,
even when the file system was using 1k or 4k block size.

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/jbd2/journal.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/jbd2.h | 11 +----
2 files changed, 118 insertions(+), 9 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 3f473fa..8e0c0a9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/hash.h>
+#include <linux/log2.h>
+#include <linux/vmalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>
@@ -92,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);

static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_slab(size_t slab_size);

/*
* Helper function used to manage commit timeouts
@@ -1247,6 +1250,13 @@ int jbd2_journal_load(journal_t *journal)
}
}

+ /*
+ * Create a slab for this blocksize
+ */
+ err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
+ if (err)
+ return err;
+
/* Let the recovery code check whether it needs to recover any
* data from the journal. */
if (jbd2_journal_recover(journal))
@@ -1806,6 +1816,111 @@ size_t journal_tag_bytes(journal_t *journal)
}

/*
+ * JBD memory management
+ */
+
+#define JBD2_MAX_SLABS 8
+static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
+
+static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
+ "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
+ "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
+};
+
+
+static void jbd2_journal_destroy_slabs(void)
+{
+ int i;
+
+ for (i = 0; i < JBD2_MAX_SLABS; i++) {
+ if (jbd2_slab[i])
+ kmem_cache_destroy(jbd2_slab[i]);
+ jbd2_slab[i] = NULL;
+ }
+}
+
+static int jbd2_journal_create_slab(size_t size)
+{
+ int i = order_base_2(size) - 10;
+ size_t slab_size;
+
+ if (size == PAGE_SIZE)
+ return 0;
+
+ if (i >= JBD2_MAX_SLABS)
+ return -EINVAL;
+
+ if (unlikely(i < 0))
+ i = 0;
+ if (jbd2_slab[i])
+ return 0; /* Already created */
+
+ slab_size = 1 << (i+10);
+ jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
+ slab_size, 0, NULL);
+ if (!jbd2_slab[i]) {
+ printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static struct kmem_cache *get_slab(size_t size)
+{
+ int i = order_base_2(size) - 10;
+
+ BUG_ON(i >= JBD2_MAX_SLABS);
+ if (unlikely(i < 0))
+ i = 0;
+ BUG_ON(jbd2_slab[i] == 0);
+ return jbd2_slab[i];
+}
+
+void *jbd2_alloc(size_t size, gfp_t flags)
+{
+ void *ptr;
+
+ BUG_ON(size & (size-1)); /* Must be a power of 2 */
+
+ flags |= __GFP_REPEAT;
+ if (size == PAGE_SIZE)
+ ptr = (void *)__get_free_pages(flags, 0);
+ else if (size > PAGE_SIZE) {
+ int order = get_order(size);
+
+ if (order < 3)
+ ptr = (void *)__get_free_pages(flags, order);
+ else
+ ptr = vmalloc(size);
+ } else
+ ptr = kmem_cache_alloc(get_slab(size), flags);
+
+ /* Check alignment; SLUB has gotten this wrong in the past,
+ * and this can lead to user data corruption! */
+ BUG_ON(((unsigned long) ptr) & (size-1));
+
+ return ptr;
+}
+
+void jbd2_free(void *ptr, size_t size)
+{
+ if (size == PAGE_SIZE) {
+ free_pages((unsigned long)ptr, 0);
+ return;
+ }
+ if (size > PAGE_SIZE) {
+ int order = get_order(size);
+
+ if (order < 3)
+ free_pages((unsigned long)ptr, order);
+ else
+ vfree(ptr);
+ return;
+ }
+ kmem_cache_free(get_slab(size), ptr);
+};
+
+/*
* Journal_head storage management
*/
static struct kmem_cache *jbd2_journal_head_cache;
@@ -2202,6 +2317,7 @@ static void jbd2_journal_destroy_caches(void)
jbd2_journal_destroy_revoke_caches();
jbd2_journal_destroy_jbd2_journal_head_cache();
jbd2_journal_destroy_handle_cache();
+ jbd2_journal_destroy_slabs();
}

static int __init journal_init(void)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f1011f7..a7a8c7a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug;
#define jbd_debug(f, a...) /**/
#endif

-static inline void *jbd2_alloc(size_t size, gfp_t flags)
-{
- return (void *)__get_free_pages(flags, get_order(size));
-}

2009-12-02 16:48:03

by Josef Bacik

[permalink] [raw]
Subject: Re: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations

On Tue, Dec 01, 2009 at 08:14:11PM -0500, Theodore Ts'o wrote:
> Now that the SLUB seems to be fixed so that it respects the requested
> alignment, use kmem_cache_alloc() to allocator if the block size of
> the buffer heads to be allocated is less than the page size.
> Previously, we were using 16k page on a Power system for each buffer,
> even when the file system was using 1k or 4k block size.
>
> Signed-off-by: "Theodore Ts'o" <[email protected]>

Heh funny you should bring this up, we just fixed a problem in RHEL5 related to
using slabs. If you have a non-jbd based root filesystem and then mount a bunch
of jbd-based fs's at the same time you will race with the cache creation stuff
and end up getting errors about creating the same slab cache multiple times.
The way this was fixed in RHEL5 was to put a mutex around the slabcache creation
and deletion so this sort of race. It seems that this patch has the same issue
as the 2.6.18 jbd had. Thanks,

Josef

2009-12-02 16:53:27

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations

Theodore Ts'o wrote:
> Now that the SLUB seems to be fixed so that it respects the requested
> alignment, use kmem_cache_alloc() to allocator if the block size of
> the buffer heads to be allocated is less than the page size.
> Previously, we were using 16k page on a Power system for each buffer,
> even when the file system was using 1k or 4k block size.

So, this undoes commit c089d490dfbf53bc0893dc9ef57cf3ee6448314d
more or less, right:

JBD: Replace slab allocations with page allocations

Author: Mingming Cao <[email protected]>

JBD allocate memory for committed_data and frozen_data from slab. However
JBD should not pass slab pages down to the block layer.
Use page allocator pages instead. This will also prepare JBD for the large blocksize patchset.

Was alignment the only reason that commit went in?


Also, there is a race issue here I think, that we've had bugs
on internally, but w/ the above commit it didn't matter anymore:

If you simultaneously mount several jbd(2)-based filesystems
at once, before the slabs get created, nothing protects jbd2_slab[]
and we can race on creation/initialization of that array.

See also https://bugzilla.lustre.org/show_bug.cgi?id=19184

and a proposed patch:

https://bugzilla.lustre.org/attachment.cgi?id=22906

but I think that patch is a little heavy-handed, the down_reads
of the slab lock seem to be extraneous because if the fs is mounted,
nobody will be tearing down that slab anyway, since that only
happens on module unload, right?

-Eric

> Signed-off-by: "Theodore Ts'o" <[email protected]>
> ---
> fs/jbd2/journal.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/jbd2.h | 11 +----
> 2 files changed, 118 insertions(+), 9 deletions(-)
>
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 3f473fa..8e0c0a9 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -39,6 +39,8 @@
> #include <linux/seq_file.h>
> #include <linux/math64.h>
> #include <linux/hash.h>
> +#include <linux/log2.h>
> +#include <linux/vmalloc.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/jbd2.h>
> @@ -92,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
>
> static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
> static void __journal_abort_soft (journal_t *journal, int errno);
> +static int jbd2_journal_create_slab(size_t slab_size);
>
> /*
> * Helper function used to manage commit timeouts
> @@ -1247,6 +1250,13 @@ int jbd2_journal_load(journal_t *journal)
> }
> }
>
> + /*
> + * Create a slab for this blocksize
> + */
> + err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
> + if (err)
> + return err;
> +
> /* Let the recovery code check whether it needs to recover any
> * data from the journal. */
> if (jbd2_journal_recover(journal))
> @@ -1806,6 +1816,111 @@ size_t journal_tag_bytes(journal_t *journal)
> }
>
> /*
> + * JBD memory management
> + */
> +
> +#define JBD2_MAX_SLABS 8
> +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
> +
> +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
> + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
> + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
> +};
> +
> +
> +static void jbd2_journal_destroy_slabs(void)
> +{
> + int i;
> +
> + for (i = 0; i < JBD2_MAX_SLABS; i++) {
> + if (jbd2_slab[i])
> + kmem_cache_destroy(jbd2_slab[i]);
> + jbd2_slab[i] = NULL;
> + }
> +}
> +
> +static int jbd2_journal_create_slab(size_t size)
> +{
> + int i = order_base_2(size) - 10;
> + size_t slab_size;
> +
> + if (size == PAGE_SIZE)
> + return 0;
> +
> + if (i >= JBD2_MAX_SLABS)
> + return -EINVAL;
> +
> + if (unlikely(i < 0))
> + i = 0;
> + if (jbd2_slab[i])
> + return 0; /* Already created */
> +
> + slab_size = 1 << (i+10);
> + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
> + slab_size, 0, NULL);
> + if (!jbd2_slab[i]) {
> + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
> + return -ENOMEM;
> + }
> + return 0;
> +}
> +
> +static struct kmem_cache *get_slab(size_t size)
> +{
> + int i = order_base_2(size) - 10;
> +
> + BUG_ON(i >= JBD2_MAX_SLABS);
> + if (unlikely(i < 0))
> + i = 0;
> + BUG_ON(jbd2_slab[i] == 0);
> + return jbd2_slab[i];
> +}
> +
> +void *jbd2_alloc(size_t size, gfp_t flags)
> +{
> + void *ptr;
> +
> + BUG_ON(size & (size-1)); /* Must be a power of 2 */
> +
> + flags |= __GFP_REPEAT;
> + if (size == PAGE_SIZE)
> + ptr = (void *)__get_free_pages(flags, 0);
> + else if (size > PAGE_SIZE) {
> + int order = get_order(size);
> +
> + if (order < 3)
> + ptr = (void *)__get_free_pages(flags, order);
> + else
> + ptr = vmalloc(size);
> + } else
> + ptr = kmem_cache_alloc(get_slab(size), flags);
> +
> + /* Check alignment; SLUB has gotten this wrong in the past,
> + * and this can lead to user data corruption! */
> + BUG_ON(((unsigned long) ptr) & (size-1));
> +
> + return ptr;
> +}
> +
> +void jbd2_free(void *ptr, size_t size)
> +{
> + if (size == PAGE_SIZE) {
> + free_pages((unsigned long)ptr, 0);
> + return;
> + }
> + if (size > PAGE_SIZE) {
> + int order = get_order(size);
> +
> + if (order < 3)
> + free_pages((unsigned long)ptr, order);
> + else
> + vfree(ptr);
> + return;
> + }
> + kmem_cache_free(get_slab(size), ptr);
> +};
> +
> +/*
> * Journal_head storage management
> */
> static struct kmem_cache *jbd2_journal_head_cache;
> @@ -2202,6 +2317,7 @@ static void jbd2_journal_destroy_caches(void)
> jbd2_journal_destroy_revoke_caches();
> jbd2_journal_destroy_jbd2_journal_head_cache();
> jbd2_journal_destroy_handle_cache();
> + jbd2_journal_destroy_slabs();
> }
>
> static int __init journal_init(void)
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index f1011f7..a7a8c7a 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug;
> #define jbd_debug(f, a...) /**/
> #endif
>
> -static inline void *jbd2_alloc(size_t size, gfp_t flags)
> -{
> - return (void *)__get_free_pages(flags, get_order(size));
> -}
> -
> -static inline void jbd2_free(void *ptr, size_t size)
> -{
> - free_pages((unsigned long)ptr, get_order(size));
> -};
> +extern void *jbd2_alloc(size_t size, gfp_t flags);
> +extern void jbd2_free(void *ptr, size_t size);
>
> #define JBD2_MIN_JOURNAL_BLOCKS 1024
>


2009-12-02 19:32:06

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations

On 2009-12-02, at 09:53, Eric Sandeen wrote:
> So, this undoes commit c089d490dfbf53bc0893dc9ef57cf3ee6448314d
> more or less, right:
>
> JBD: Replace slab allocations with page allocations
>
> Author: Mingming Cao <[email protected]>
>
> JBD allocate memory for committed_data and frozen_data from slab.
> However JBD should not pass slab pages down to the block layer.
> Use page allocator pages instead. This will also prepare JBD for the
> large blocksize patchset.
>
> Was alignment the only reason that commit went in?

The reason had to do with the block layer, but I don't recall if
alignment was the only reason for it.

> Also, there is a race issue here I think, that we've had bugs
> on internally, but w/ the above commit it didn't matter anymore:
>
> If you simultaneously mount several jbd(2)-based filesystems
> at once, before the slabs get created, nothing protects jbd2_slab[]
> and we can race on creation/initialization of that array.
>
> See also https://bugzilla.lustre.org/show_bug.cgi?id=19184
>
> and a proposed patch:
>
> https://bugzilla.lustre.org/attachment.cgi?id=22906
>
> but I think that patch is a little heavy-handed, the down_reads
> of the slab lock seem to be extraneous because if the fs is mounted,
> nobody will be tearing down that slab anyway, since that only
> happens on module unload, right?

Right, the slabs should only be created once, but they can never be
destroyed while in use, so the rest of the locking is superfluous,
though it at least deserves to be replaced by a comment indicating why
no locking is needed for it.

>> Signed-off-by: "Theodore Ts'o" <[email protected]>
>> ---
>> fs/jbd2/journal.c | 116 ++++++++++++++++++++++++++++++++++++++++
>> ++++++++++
>> include/linux/jbd2.h | 11 +----
>> 2 files changed, 118 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
>> index 3f473fa..8e0c0a9 100644
>> --- a/fs/jbd2/journal.c
>> +++ b/fs/jbd2/journal.c
>> @@ -39,6 +39,8 @@
>> #include <linux/seq_file.h>
>> #include <linux/math64.h>
>> #include <linux/hash.h>
>> +#include <linux/log2.h>
>> +#include <linux/vmalloc.h>
>>
>> #define CREATE_TRACE_POINTS
>> #include <trace/events/jbd2.h>
>> @@ -92,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
>>
>> static int journal_convert_superblock_v1(journal_t *,
>> journal_superblock_t *);
>> static void __journal_abort_soft (journal_t *journal, int errno);
>> +static int jbd2_journal_create_slab(size_t slab_size);
>>
>> /*
>> * Helper function used to manage commit timeouts
>> @@ -1247,6 +1250,13 @@ int jbd2_journal_load(journal_t *journal)
>> }
>> }
>>
>> + /*
>> + * Create a slab for this blocksize
>> + */
>> + err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
>> + if (err)
>> + return err;
>> +
>> /* Let the recovery code check whether it needs to recover any
>> * data from the journal. */
>> if (jbd2_journal_recover(journal))
>> @@ -1806,6 +1816,111 @@ size_t journal_tag_bytes(journal_t *journal)
>> }
>>
>> /*
>> + * JBD memory management
>> + */
>> +
>> +#define JBD2_MAX_SLABS 8
>> +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
>> +
>> +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
>> + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
>> + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
>> +};
>> +
>> +
>> +static void jbd2_journal_destroy_slabs(void)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < JBD2_MAX_SLABS; i++) {
>> + if (jbd2_slab[i])
>> + kmem_cache_destroy(jbd2_slab[i]);
>> + jbd2_slab[i] = NULL;
>> + }
>> +}
>> +
>> +static int jbd2_journal_create_slab(size_t size)
>> +{
>> + int i = order_base_2(size) - 10;
>> + size_t slab_size;
>> +
>> + if (size == PAGE_SIZE)
>> + return 0;
>> +
>> + if (i >= JBD2_MAX_SLABS)
>> + return -EINVAL;
>> +
>> + if (unlikely(i < 0))
>> + i = 0;
>> + if (jbd2_slab[i])
>> + return 0; /* Already created */
>> +
>> + slab_size = 1 << (i+10);
>> + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
>> + slab_size, 0, NULL);
>> + if (!jbd2_slab[i]) {
>> + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
>> + return -ENOMEM;
>> + }
>> + return 0;
>> +}
>> +
>> +static struct kmem_cache *get_slab(size_t size)
>> +{
>> + int i = order_base_2(size) - 10;
>> +
>> + BUG_ON(i >= JBD2_MAX_SLABS);
>> + if (unlikely(i < 0))
>> + i = 0;
>> + BUG_ON(jbd2_slab[i] == 0);
>> + return jbd2_slab[i];
>> +}
>> +
>> +void *jbd2_alloc(size_t size, gfp_t flags)
>> +{
>> + void *ptr;
>> +
>> + BUG_ON(size & (size-1)); /* Must be a power of 2 */
>> +
>> + flags |= __GFP_REPEAT;
>> + if (size == PAGE_SIZE)
>> + ptr = (void *)__get_free_pages(flags, 0);
>> + else if (size > PAGE_SIZE) {
>> + int order = get_order(size);
>> +
>> + if (order < 3)
>> + ptr = (void *)__get_free_pages(flags, order);
>> + else
>> + ptr = vmalloc(size);
>> + } else
>> + ptr = kmem_cache_alloc(get_slab(size), flags);
>> +
>> + /* Check alignment; SLUB has gotten this wrong in the past,
>> + * and this can lead to user data corruption! */
>> + BUG_ON(((unsigned long) ptr) & (size-1));
>> +
>> + return ptr;
>> +}
>> +
>> +void jbd2_free(void *ptr, size_t size)
>> +{
>> + if (size == PAGE_SIZE) {
>> + free_pages((unsigned long)ptr, 0);
>> + return;
>> + }
>> + if (size > PAGE_SIZE) {
>> + int order = get_order(size);
>> +
>> + if (order < 3)
>> + free_pages((unsigned long)ptr, order);
>> + else
>> + vfree(ptr);
>> + return;
>> + }
>> + kmem_cache_free(get_slab(size), ptr);
>> +};
>> +
>> +/*
>> * Journal_head storage management
>> */
>> static struct kmem_cache *jbd2_journal_head_cache;
>> @@ -2202,6 +2317,7 @@ static void jbd2_journal_destroy_caches(void)
>> jbd2_journal_destroy_revoke_caches();
>> jbd2_journal_destroy_jbd2_journal_head_cache();
>> jbd2_journal_destroy_handle_cache();
>> + jbd2_journal_destroy_slabs();
>> }
>>
>> static int __init journal_init(void)
>> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
>> index f1011f7..a7a8c7a 100644
>> --- a/include/linux/jbd2.h
>> +++ b/include/linux/jbd2.h
>> @@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug;
>> #define jbd_debug(f, a...) /**/
>> #endif
>>
>> -static inline void *jbd2_alloc(size_t size, gfp_t flags)
>> -{
>> - return (void *)__get_free_pages(flags, get_order(size));
>> -}
>> -
>> -static inline void jbd2_free(void *ptr, size_t size)
>> -{
>> - free_pages((unsigned long)ptr, get_order(size));
>> -};
>> +extern void *jbd2_alloc(size_t size, gfp_t flags);
>> +extern void jbd2_free(void *ptr, size_t size);
>>
>> #define JBD2_MIN_JOURNAL_BLOCKS 1024
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-
> ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html


Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


2009-12-02 20:12:30

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations

On Wed, Dec 02, 2009 at 10:53:26AM -0600, Eric Sandeen wrote:
>
> So, this undoes commit c089d490dfbf53bc0893dc9ef57cf3ee6448314d
> more or less, right:
>
> JBD: Replace slab allocations with page allocations

Close, but not quite. We still use getfreepage() in the case where
blocksize==pagesize. So in the most common case of 4k blocks on
x86/x86_64, we won't be sufferring the overhead of using the
SLAB/SLUB/SLOB/SLQB machinery.

> Was alignment the only reason that commit went in?

Alignment and if debugging was enabled, it meant that you needed two
contiguous pages for every 4k shadow buffer_head used by the JBD
layer, not to mention the overhead of tracking each page in the slab
cache.

> Also, there is a race issue here I think, that we've had bugs
> on internally, but w/ the above commit it didn't matter anymore:
>
> If you simultaneously mount several jbd(2)-based filesystems
> at once, before the slabs get created, nothing protects jbd2_slab[]
> and we can race on creation/initialization of that array.
>
> See a proposed patch:
>
> https://bugzilla.lustre.org/attachment.cgi?id=22906
>
> but I think that patch is a little heavy-handed, the down_reads
> of the slab lock seem to be extraneous because if the fs is mounted,
> nobody will be tearing down that slab anyway, since that only
> happens on module unload, right?

Yeah, we only need the lock at mount time, when we are filling in the
jbd_slab[] array. Thanks for pointing that out; I'll fix it and
resend the patch.

- Ted

2009-12-04 02:43:32

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH -v2] ext4: Use slab allocator for sub-page sized allocations

Now that the SLUB seems to be fixed so that it respects the requested
alignment, use kmem_cache_alloc() to allocator if the block size of
the buffer heads to be allocated is less than the page size.
Previously, we were using 16k page on a Power system for each buffer,
even when the file system was using 1k or 4k block size.

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
Address the concerns found in https://bugzilla.lustre.org/show_bug.cgi?id=19184

fs/jbd2/journal.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/jbd2.h | 11 +---
2 files changed, 134 insertions(+), 9 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 3f473fa..5824353 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/hash.h>
+#include <linux/log2.h>
+#include <linux/vmalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>
@@ -92,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);

static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_slab(size_t slab_size);

/*
* Helper function used to manage commit timeouts
@@ -1247,6 +1250,13 @@ int jbd2_journal_load(journal_t *journal)
}
}

+ /*
+ * Create a slab for this blocksize
+ */
+ err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
+ if (err)
+ return err;
+
/* Let the recovery code check whether it needs to recover any
* data from the journal. */
if (jbd2_journal_recover(journal))
@@ -1806,6 +1816,127 @@ size_t journal_tag_bytes(journal_t *journal)
}

/*
+ * JBD memory management
+ *
+ * These functions are used to allocate block-sized chunks of memory
+ * used for making copies of buffer_head data. Very often it will be
+ * page-sized chunks of data, but sometimes it will be in
+ * sub-page-size chunks. (For example, 16k pages on Power systems
+ * with a 4k block file system.) For blocks smaller than a page, we
+ * use a SLAB allocator. There are slab caches for each block size,
+ * which are allocated at mount time, if necessary, and we only free
+ * (all of) the slab caches when/if the jbd2 module is unloaded. For
+ * this reason we don't need to a mutex to protect access to
+ * jbd2_slab[] allocating or releasing memory; only in
+ * jbd2_journal_create_slab().
+ */
+#define JBD2_MAX_SLABS 8
+static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
+static DECLARE_MUTEX(jbd2_slab_create_sem);
+
+static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
+ "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
+ "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
+};
+
+
+static void jbd2_journal_destroy_slabs(void)
+{
+ int i;
+
+ for (i = 0; i < JBD2_MAX_SLABS; i++) {
+ if (jbd2_slab[i])
+ kmem_cache_destroy(jbd2_slab[i]);
+ jbd2_slab[i] = NULL;
+ }
+}
+
+static int jbd2_journal_create_slab(size_t size)
+{
+ int i = order_base_2(size) - 10;
+ size_t slab_size;
+
+ if (size == PAGE_SIZE)
+ return 0;
+
+ if (i >= JBD2_MAX_SLABS)
+ return -EINVAL;
+
+ if (unlikely(i < 0))
+ i = 0;
+ down(&jbd2_slab_create_sem);
+ if (jbd2_slab[i]) {
+ up(&jbd2_slab_create_sem);
+ return 0; /* Already created */
+ }
+
+ slab_size = 1 << (i+10);
+ jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
+ slab_size, 0, NULL);
+ up(&jbd2_slab_create_sem);
+ if (!jbd2_slab[i]) {
+ printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static struct kmem_cache *get_slab(size_t size)
+{
+ int i = order_base_2(size) - 10;
+
+ BUG_ON(i >= JBD2_MAX_SLABS);
+ if (unlikely(i < 0))
+ i = 0;
+ BUG_ON(jbd2_slab[i] == 0);
+ return jbd2_slab[i];
+}
+
+void *jbd2_alloc(size_t size, gfp_t flags)
+{
+ void *ptr;
+
+ BUG_ON(size & (size-1)); /* Must be a power of 2 */
+
+ flags |= __GFP_REPEAT;
+ if (size == PAGE_SIZE)
+ ptr = (void *)__get_free_pages(flags, 0);
+ else if (size > PAGE_SIZE) {
+ int order = get_order(size);
+
+ if (order < 3)
+ ptr = (void *)__get_free_pages(flags, order);
+ else
+ ptr = vmalloc(size);
+ } else
+ ptr = kmem_cache_alloc(get_slab(size), flags);
+
+ /* Check alignment; SLUB has gotten this wrong in the past,
+ * and this can lead to user data corruption! */
+ BUG_ON(((unsigned long) ptr) & (size-1));
+
+ return ptr;
+}
+
+void jbd2_free(void *ptr, size_t size)
+{
+ if (size == PAGE_SIZE) {
+ free_pages((unsigned long)ptr, 0);
+ return;
+ }
+ if (size > PAGE_SIZE) {
+ int order = get_order(size);
+
+ if (order < 3)
+ free_pages((unsigned long)ptr, order);
+ else
+ vfree(ptr);
+ return;
+ }
+ kmem_cache_free(get_slab(size), ptr);
+};
+
+/*
* Journal_head storage management
*/
static struct kmem_cache *jbd2_journal_head_cache;
@@ -2202,6 +2333,7 @@ static void jbd2_journal_destroy_caches(void)
jbd2_journal_destroy_revoke_caches();
jbd2_journal_destroy_jbd2_journal_head_cache();
jbd2_journal_destroy_handle_cache();
+ jbd2_journal_destroy_slabs();
}

static int __init journal_init(void)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f1011f7..a7a8c7a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug;
#define jbd_debug(f, a...) /**/
#endif

-static inline void *jbd2_alloc(size_t size, gfp_t flags)
-{
- return (void *)__get_free_pages(flags, get_order(size));
-}

2009-12-05 06:43:17

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations

Eric Sandeen wrote:
> Theodore Ts'o wrote:
>> Now that the SLUB seems to be fixed so that it respects the requested
>> alignment, use kmem_cache_alloc() to allocator if the block size of
>> the buffer heads to be allocated is less than the page size.
>> Previously, we were using 16k page on a Power system for each buffer,
>> even when the file system was using 1k or 4k block size.
>
> So, this undoes commit c089d490dfbf53bc0893dc9ef57cf3ee6448314d
> more or less, right:
>
> JBD: Replace slab allocations with page allocations
>
> Author: Mingming Cao <[email protected]>
>
> JBD allocate memory for committed_data and frozen_data from slab. However
> JBD should not pass slab pages down to the block layer.
> Use page allocator pages instead. This will also prepare JBD for the large blocksize patchset.
>
> Was alignment the only reason that commit went in?

Actually, Christoph reminded me that iscsi & co will not like this.

See commit 1fa40b01ae4d1b00e366d4949edcc230f5cd6d99 for xfs moving
in the opposite direction back in 2007...

-Eric