2006-08-23 23:05:12

by Badari Pulavarty

[permalink] [raw]
Subject: [RFC][PATCH] Manage jbd allocations from its own slabs

Hi,

Here is the fix to "bh: Ensure bh fits within a page" problem
caused by JBD.

BTW, I realized that this problem can happen only with 1k, 2k
filesystems - as 4k, 8k allocations disable slab debug
automatically. But for completeness, I created slabs for those
also.

What do you think ? I ran basic tests and things are fine.

Thanks,
Badari

JBD currently allocates commit and frozen buffers from slabs.
With CONFIG_SLAB_DEBUG, its possible for an allocation to
cross the page boundary causing IO problems.

https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=200127

So, instead of allocating these from regular slabs - manage
allocation from its own slabs and disable slab debug for these slabs.

Signed-off-by: Badari Pulavarty <[email protected]>
---
fs/jbd/commit.c | 6 +--
fs/jbd/journal.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++---
fs/jbd/transaction.c | 9 ++---
include/linux/jbd.h | 3 +
4 files changed, 95 insertions(+), 11 deletions(-)

Index: linux-2.6.18-rc4/fs/jbd/journal.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/journal.c 2006-08-23 14:47:25.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/journal.c 2006-08-23 15:59:06.000000000 -0700
@@ -328,10 +328,10 @@
char *tmp;

jbd_unlock_bh_state(bh_in);
- tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS);
+ tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS);
jbd_lock_bh_state(bh_in);
if (jh_in->b_frozen_data) {
- kfree(tmp);
+ jbd_slab_free(tmp, bh_in->b_size);
goto repeat;
}

@@ -1612,6 +1612,89 @@
}

/*
+ * jbd slab management: create 1k, 2k, 4k, 8k slabs and allocate
+ * frozen and commit buffers from these slabs.
+ *
+ * Reason for doing this is to avoid, SLAB_DEBUG - since it could
+ * cause bh to cross page boundary.
+ */
+
+static kmem_cache_t *jbd_slab[4];
+static const char *jbd_slab_names[4] = {
+ "jbd_1k",
+ "jbd_2k",
+ "jbd_4k",
+ "jbd_8k",
+};
+
+static void journal_destroy_jbd_slabs(void)
+{
+ int i;
+
+ for (i=0; i<4; i++) {
+ if (jbd_slab[i])
+ kmem_cache_destroy(jbd_slab[i]);
+ jbd_slab[i] = NULL;
+ }
+}
+
+static int journal_init_jbd_slabs(void)
+{
+ int i = 0;
+ int retval = 0;
+
+ for (i=0; i<4; i++) {
+ unsigned long slab_size = 1024 << i;
+ jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
+ slab_size, slab_size,
+ 0, NULL, NULL);
+ if (jbd_slab[i] == 0) {
+ journal_destroy_jbd_slabs();
+ retval = -ENOMEM;
+ printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
+ goto out;
+ }
+ }
+out:
+ return retval;
+}
+
+static int jbd_find_slab_index(size_t size)
+{
+ int idx = 0;
+
+ switch (size) {
+ case 1024: idx = 0;
+ break;
+ case 2048: idx = 1;
+ break;
+ case 4096: idx = 2;
+ break;
+ case 8192: idx = 3;
+ break;
+ default: printk("JBD unknown slab size %ld\n", size);
+ BUG();
+ }
+ return idx;
+}
+
+void * jbd_slab_alloc(size_t size, gfp_t flags)
+{
+ int idx;
+
+ idx = jbd_find_slab_index(size);
+ return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
+}
+
+void jbd_slab_free(void *ptr, size_t size)
+{
+ int idx;
+
+ idx = jbd_find_slab_index(size);
+ kmem_cache_free(jbd_slab[idx], ptr);
+}
+
+/*
* Journal_head storage management
*/
static kmem_cache_t *journal_head_cache;
@@ -1799,13 +1882,13 @@
printk(KERN_WARNING "%s: freeing "
"b_frozen_data\n",
__FUNCTION__);
- kfree(jh->b_frozen_data);
+ jbd_slab_free(jh->b_frozen_data, bh->b_size);
}
if (jh->b_committed_data) {
printk(KERN_WARNING "%s: freeing "
"b_committed_data\n",
__FUNCTION__);
- kfree(jh->b_committed_data);
+ jbd_slab_free(jh->b_committed_data, bh->b_size);
}
bh->b_private = NULL;
jh->b_bh = NULL; /* debug, really */
@@ -1953,6 +2036,8 @@
ret = journal_init_journal_head_cache();
if (ret == 0)
ret = journal_init_handle_cache();
+ if (ret == 0)
+ ret = journal_init_jbd_slabs();
return ret;
}

@@ -1961,6 +2046,7 @@
journal_destroy_revoke_caches();
journal_destroy_journal_head_cache();
journal_destroy_handle_cache();
+ journal_destroy_jbd_slabs();
}

static int __init journal_init(void)
Index: linux-2.6.18-rc4/fs/jbd/transaction.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/transaction.c 2006-08-23 14:05:57.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/transaction.c 2006-08-23 14:50:17.000000000 -0700
@@ -666,8 +666,9 @@
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
jbd_unlock_bh_state(bh);
- frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
- GFP_NOFS);
+ frozen_buffer =
+ jbd_slab_alloc(jh2bh(jh)->b_size,
+ GFP_NOFS);
if (!frozen_buffer) {
printk(KERN_EMERG
"%s: OOM for frozen_buffer\n",
@@ -879,7 +880,7 @@

repeat:
if (!jh->b_committed_data) {
- committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS);
+ committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
if (!committed_data) {
printk(KERN_EMERG "%s: No memory for committed data\n",
__FUNCTION__);
@@ -906,7 +907,7 @@
out:
journal_put_journal_head(jh);
if (unlikely(committed_data))
- kfree(committed_data);
+ jbd_slab_free(committed_data, bh->b_size);
return err;
}

Index: linux-2.6.18-rc4/fs/jbd/commit.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/commit.c 2006-08-23 14:05:57.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/commit.c 2006-08-23 14:50:17.000000000 -0700
@@ -261,7 +261,7 @@
struct buffer_head *bh = jh2bh(jh);

jbd_lock_bh_state(bh);
- kfree(jh->b_committed_data);
+ jbd_slab_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
jbd_unlock_bh_state(bh);
}
@@ -745,14 +745,14 @@
* Otherwise, we can just throw away the frozen data now.
*/
if (jh->b_committed_data) {
- kfree(jh->b_committed_data);
+ jbd_slab_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
if (jh->b_frozen_data) {
jh->b_committed_data = jh->b_frozen_data;
jh->b_frozen_data = NULL;
}
} else if (jh->b_frozen_data) {
- kfree(jh->b_frozen_data);
+ jbd_slab_free(jh->b_frozen_data, bh->b_size);
jh->b_frozen_data = NULL;
}

Index: linux-2.6.18-rc4/include/linux/jbd.h
===================================================================
--- linux-2.6.18-rc4.orig/include/linux/jbd.h 2006-08-23 14:05:57.000000000 -0700
+++ linux-2.6.18-rc4/include/linux/jbd.h 2006-08-23 14:50:17.000000000 -0700
@@ -72,6 +72,9 @@
#endif

extern void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry);
+extern void * jbd_slab_alloc(size_t size, gfp_t flags);
+extern void jbd_slab_free(void *ptr, size_t size);
+
#define jbd_kmalloc(size, flags) \
__jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry)
#define jbd_rep_kmalloc(size, flags) \





2006-08-23 23:34:32

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC][PATCH] Manage jbd allocations from its own slabs

On Wed, 23 Aug 2006 16:08:15 -0700
Badari Pulavarty <[email protected]> wrote:

> Hi,
>
> Here is the fix to "bh: Ensure bh fits within a page" problem
> caused by JBD.
>
> BTW, I realized that this problem can happen only with 1k, 2k
> filesystems - as 4k, 8k allocations disable slab debug
> automatically. But for completeness, I created slabs for those
> also.
>
> What do you think ? I ran basic tests and things are fine.
>

Thanks for working on this.

> ...
>
> /*
> + * jbd slab management: create 1k, 2k, 4k, 8k slabs and allocate
> + * frozen and commit buffers from these slabs.
> + *
> + * Reason for doing this is to avoid, SLAB_DEBUG - since it could
> + * cause bh to cross page boundary.
> + */
> +
> +static kmem_cache_t *jbd_slab[4];
> +static const char *jbd_slab_names[4] = {
> + "jbd_1k",
> + "jbd_2k",
> + "jbd_4k",
> + "jbd_8k",
> +};
> +
> +static void journal_destroy_jbd_slabs(void)
> +{
> + int i;
> +
> + for (i=0; i<4; i++) {
> + if (jbd_slab[i])
> + kmem_cache_destroy(jbd_slab[i]);
> + jbd_slab[i] = NULL;
> + }
> +}
> +
> +static int journal_init_jbd_slabs(void)
> +{
> + int i = 0;
> + int retval = 0;
> +
> + for (i=0; i<4; i++) {
> + unsigned long slab_size = 1024 << i;
> + jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
> + slab_size, slab_size,
> + 0, NULL, NULL);

OK, passing align=slab_size fixes the bug.

> + if (jbd_slab[i] == 0) {
> + journal_destroy_jbd_slabs();
> + retval = -ENOMEM;
> + printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
> + goto out;
> + }
> + }
> +out:
> + return retval;
> +}

Do we have to create all four slabs up-front? Perhaps we can defer that
until mount-time, after we have determined the filesystem's block size.

That way, most people's machines will only ever create a single slab cache:
jbd_4k.

> +static int jbd_find_slab_index(size_t size)
> +{
> + int idx = 0;
> +
> + switch (size) {
> + case 1024: idx = 0;
> + break;
> + case 2048: idx = 1;
> + break;
> + case 4096: idx = 2;
> + break;
> + case 8192: idx = 3;
> + break;
> + default: printk("JBD unknown slab size %ld\n", size);
> + BUG();
> + }
> + return idx;
> +}

I'd suggest this function be changed to directly return a kmem_cache_t *.

> +void * jbd_slab_alloc(size_t size, gfp_t flags)
> +{
> + int idx;
> +
> + idx = jbd_find_slab_index(size);
> + return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
> +}
> +
> +void jbd_slab_free(void *ptr, size_t size)
> +{
> + int idx;
> +
> + idx = jbd_find_slab_index(size);
> + kmem_cache_free(jbd_slab[idx], ptr);
> +}

Then these become simpler.


2006-08-24 12:29:38

by Dave Kleikamp

[permalink] [raw]
Subject: Re: [Ext2-devel] [RFC][PATCH] Manage jbd allocations from its own slabs

On Wed, 2006-08-23 at 16:08 -0700, Badari Pulavarty wrote:
> Hi,
>
> Here is the fix to "bh: Ensure bh fits within a page" problem
> caused by JBD.
>
> BTW, I realized that this problem can happen only with 1k, 2k
> filesystems - as 4k, 8k allocations disable slab debug
> automatically. But for completeness, I created slabs for those
> also.

With a larger base page size, you could run into the same problems for
4K and 8K allocations, so it's a good thing to do them all.

> What do you think ? I ran basic tests and things are fine.

Looks sane to me.

Shaggy
--
David Kleikamp
IBM Linux Technology Center

2006-08-24 12:41:10

by Dave Kleikamp

[permalink] [raw]
Subject: Re: [Ext2-devel] [RFC][PATCH] Manage jbd allocations from its own slabs

On Wed, 2006-08-23 at 16:34 -0700, Andrew Morton wrote:
> On Wed, 23 Aug 2006 16:08:15 -0700
> Badari Pulavarty <[email protected]> wrote:

> > /*
> > + * jbd slab management: create 1k, 2k, 4k, 8k slabs and allocate
> > + * frozen and commit buffers from these slabs.
> > + *
> > + * Reason for doing this is to avoid, SLAB_DEBUG - since it could
> > + * cause bh to cross page boundary.
> > + */
> > +
> > +static kmem_cache_t *jbd_slab[4];
> > +static const char *jbd_slab_names[4] = {
> > + "jbd_1k",
> > + "jbd_2k",
> > + "jbd_4k",
> > + "jbd_8k",
> > +};
> > +
> > +static void journal_destroy_jbd_slabs(void)
> > +{
> > + int i;
> > +
> > + for (i=0; i<4; i++) {
> > + if (jbd_slab[i])
> > + kmem_cache_destroy(jbd_slab[i]);
> > + jbd_slab[i] = NULL;
> > + }
> > +}
> > +
> > +static int journal_init_jbd_slabs(void)
> > +{
> > + int i = 0;
> > + int retval = 0;
> > +
> > + for (i=0; i<4; i++) {
> > + unsigned long slab_size = 1024 << i;
> > + jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
> > + slab_size, slab_size,
> > + 0, NULL, NULL);
>
> OK, passing align=slab_size fixes the bug.

The comments above don't mention the alignment of the slabs, so a
comment here may help explain that the alignment is the key to avoiding
the page-straddling. Or you could elaborate above.

Shaggy
--
David Kleikamp
IBM Linux Technology Center

2006-08-24 17:13:05

by Badari Pulavarty

[permalink] [raw]
Subject: Re: [Ext2-devel] [RFC][PATCH] Manage jbd allocations from its own slabs

Dave Kleikamp wrote:
> On Wed, 2006-08-23 at 16:08 -0700, Badari Pulavarty wrote:
>
>> Hi,
>>
>> Here is the fix to "bh: Ensure bh fits within a page" problem
>> caused by JBD.
>>
>> BTW, I realized that this problem can happen only with 1k, 2k
>> filesystems - as 4k, 8k allocations disable slab debug
>> automatically. But for completeness, I created slabs for those
>> also.
>>
>
> With a larger base page size, you could run into the same problems for
> 4K and 8K allocations, so it's a good thing to do them all.
>
>
Yes, with bigger base page size - this can happen for 4k, 8k also.

And also, I am re-doing the patch to address Andrew's comments - I will
send out latest
in few hours (currently testing).

Thanks,
Badari


2006-08-24 18:54:12

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC][PATCH] Manage jbd allocations from its own slabs

On Wed, Aug 23, 2006 at 04:08:15PM -0700, Badari Pulavarty wrote:
> Hi,
>
> Here is the fix to "bh: Ensure bh fits within a page" problem
> caused by JBD.
>
> BTW, I realized that this problem can happen only with 1k, 2k
> filesystems - as 4k, 8k allocations disable slab debug
> automatically. But for completeness, I created slabs for those
> also.
>
> What do you think ? I ran basic tests and things are fine.

Why can't you just use alloc_page? I bet the whole slab overhead
eats more memory than what's wasted when using alloc_pages. Especially
as the typical usecase is a 4k blocks filesystem with 4k pagesize
where the overhead of alloc_page is non-existant.

2006-08-24 19:11:29

by Badari Pulavarty

[permalink] [raw]
Subject: Re: [RFC][PATCH] Manage jbd allocations from its own slabs

Christoph Hellwig wrote:
> On Wed, Aug 23, 2006 at 04:08:15PM -0700, Badari Pulavarty wrote:
>
>> Hi,
>>
>> Here is the fix to "bh: Ensure bh fits within a page" problem
>> caused by JBD.
>>
>> BTW, I realized that this problem can happen only with 1k, 2k
>> filesystems - as 4k, 8k allocations disable slab debug
>> automatically. But for completeness, I created slabs for those
>> also.
>>
>> What do you think ? I ran basic tests and things are fine.
>>
>
> Why can't you just use alloc_page? I bet the whole slab overhead
> eats more memory than what's wasted when using alloc_pages. Especially
> as the typical usecase is a 4k blocks filesystem with 4k pagesize
> where the overhead of alloc_page is non-existant.
>

Yes. That was what proposed earlier. But for 1k, 2k allocations we end
up wasting whole page.
Isn't it ? Thats why I created right sized slabs and disable slab-debug.
I guess, I can do this
only for 1k, 2k filesystems and directly use alloc_page() for 4k and 8k
- but that would make
code ugly and also it doesn't handle cases for bigger base pagesize
systems (64k power).

Thanks,
Badari


2006-08-25 00:56:38

by Badari Pulavarty

[permalink] [raw]
Subject: Re: [RFC][PATCH] Manage jbd allocations from its own slabs

Andrew,

Here is the latest patch. Unfortunately, its not surviving my stress
tests on 1k filesystem. I keep running into

fs/buffer.c: 2791 assert in submit_bh()
BUG_ON(!buffer_mapped(bh));

I haven't touched "bh" itself. So, I am not sure whats happening
here. I am trying to reproduce it on mainline 2.6.18-rc4 (seen
it once so far - but not consistently).

Changes since the last patch:

- create appropriate slabs only when we mount the filesystem with
that blocksize.
- simplify the find slab idx process and get it by shifting size.

Thanks,
Badari

JBD currently allocates commit and frozen buffers from slabs.
With CONFIG_SLAB_DEBUG, its possible for an allocation to
cross the page boundary causing IO problems.

https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=200127

So, instead of allocating these from regular slabs - manage
allocation from its own slabs and disable slab debug for these slabs.

Signed-off-by: Badari Pulavarty <[email protected]>
---
fs/jbd/commit.c | 6 +--
fs/jbd/journal.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++---
fs/jbd/transaction.c | 9 ++---
include/linux/jbd.h | 3 +
4 files changed, 93 insertions(+), 11 deletions(-)

Index: linux-2.6.18-rc4/fs/jbd/journal.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/journal.c 2006-08-24 13:23:28.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/journal.c 2006-08-24 16:19:27.000000000 -0700
@@ -84,6 +84,7 @@

static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
+static int journal_create_jbd_slab(size_t slab_size);

/*
* Helper function used to manage commit timeouts
@@ -328,10 +329,10 @@
char *tmp;

jbd_unlock_bh_state(bh_in);
- tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS);
+ tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS);
jbd_lock_bh_state(bh_in);
if (jh_in->b_frozen_data) {
- kfree(tmp);
+ jbd_slab_free(tmp, bh_in->b_size);
goto repeat;
}

@@ -1090,6 +1091,13 @@
}
}

+ /*
+ * Make sure to create a slab for this blocksize
+ */
+ err = journal_create_jbd_slab(cpu_to_be32(journal->j_superblock->s_blocksize));
+ if (err)
+ return err;
+
/* Let the recovery code check whether it needs to recover any
* data from the journal. */
if (journal_recover(journal))
@@ -1612,6 +1620,76 @@
}

/*
+ * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
+ * and allocate frozen and commit buffers from these slabs.
+ *
+ * Reason for doing this is to avoid, SLAB_DEBUG - since it could
+ * cause bh to cross page boundary.
+ */
+
+#define JBD_MAX_SLABS 5
+#define JBD_SLAB_INDEX(size) (size >> 11)
+
+static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
+static const char *jbd_slab_names[JBD_MAX_SLABS] = {
+ "jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k"
+};
+
+static void journal_destroy_jbd_slabs(void)
+{
+ int i;
+
+ for (i=0; i<JBD_MAX_SLABS; i++) {
+ if (jbd_slab[i])
+ kmem_cache_destroy(jbd_slab[i]);
+ jbd_slab[i] = NULL;
+ }
+}
+
+static int journal_create_jbd_slab(size_t slab_size)
+{
+ int i = JBD_SLAB_INDEX(slab_size);
+
+ BUG_ON(i >= JBD_MAX_SLABS);
+ /*
+ * Check if we already have a slab created for this size
+ */
+ if (jbd_slab[i])
+ return 0;
+
+ /*
+ * Create a slab and force alignment to be same as slabsize -
+ * this will make sure that allocations won't cross the page
+ * boundary.
+ */
+ jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
+ slab_size, slab_size, 0, NULL, NULL);
+ if (!jbd_slab[i]) {
+ printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void * jbd_slab_alloc(size_t size, gfp_t flags)
+{
+ int idx;
+
+ idx = JBD_SLAB_INDEX(size);
+ BUG_ON(jbd_slab[idx] == NULL);
+ return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
+}
+
+void jbd_slab_free(void *ptr, size_t size)
+{
+ int idx;
+
+ idx = JBD_SLAB_INDEX(size);
+ BUG_ON(jbd_slab[idx] == NULL);
+ kmem_cache_free(jbd_slab[idx], ptr);
+}
+
+/*
* Journal_head storage management
*/
static kmem_cache_t *journal_head_cache;
@@ -1799,13 +1877,13 @@
printk(KERN_WARNING "%s: freeing "
"b_frozen_data\n",
__FUNCTION__);
- kfree(jh->b_frozen_data);
+ jbd_slab_free(jh->b_frozen_data, bh->b_size);
}
if (jh->b_committed_data) {
printk(KERN_WARNING "%s: freeing "
"b_committed_data\n",
__FUNCTION__);
- kfree(jh->b_committed_data);
+ jbd_slab_free(jh->b_committed_data, bh->b_size);
}
bh->b_private = NULL;
jh->b_bh = NULL; /* debug, really */
@@ -1961,6 +2039,7 @@
journal_destroy_revoke_caches();
journal_destroy_journal_head_cache();
journal_destroy_handle_cache();
+ journal_destroy_jbd_slabs();
}

static int __init journal_init(void)
Index: linux-2.6.18-rc4/fs/jbd/transaction.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/transaction.c 2006-08-24 13:23:28.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/transaction.c 2006-08-24 13:23:55.000000000 -0700
@@ -666,8 +666,9 @@
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
jbd_unlock_bh_state(bh);
- frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
- GFP_NOFS);
+ frozen_buffer =
+ jbd_slab_alloc(jh2bh(jh)->b_size,
+ GFP_NOFS);
if (!frozen_buffer) {
printk(KERN_EMERG
"%s: OOM for frozen_buffer\n",
@@ -879,7 +880,7 @@

repeat:
if (!jh->b_committed_data) {
- committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS);
+ committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
if (!committed_data) {
printk(KERN_EMERG "%s: No memory for committed data\n",
__FUNCTION__);
@@ -906,7 +907,7 @@
out:
journal_put_journal_head(jh);
if (unlikely(committed_data))
- kfree(committed_data);
+ jbd_slab_free(committed_data, bh->b_size);
return err;
}

Index: linux-2.6.18-rc4/fs/jbd/commit.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/commit.c 2006-08-24 13:23:28.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/commit.c 2006-08-24 13:23:55.000000000 -0700
@@ -261,7 +261,7 @@
struct buffer_head *bh = jh2bh(jh);

jbd_lock_bh_state(bh);
- kfree(jh->b_committed_data);
+ jbd_slab_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
jbd_unlock_bh_state(bh);
}
@@ -745,14 +745,14 @@
* Otherwise, we can just throw away the frozen data now.
*/
if (jh->b_committed_data) {
- kfree(jh->b_committed_data);
+ jbd_slab_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
if (jh->b_frozen_data) {
jh->b_committed_data = jh->b_frozen_data;
jh->b_frozen_data = NULL;
}
} else if (jh->b_frozen_data) {
- kfree(jh->b_frozen_data);
+ jbd_slab_free(jh->b_frozen_data, bh->b_size);
jh->b_frozen_data = NULL;
}

Index: linux-2.6.18-rc4/include/linux/jbd.h
===================================================================
--- linux-2.6.18-rc4.orig/include/linux/jbd.h 2006-08-24 13:23:28.000000000 -0700
+++ linux-2.6.18-rc4/include/linux/jbd.h 2006-08-24 13:23:55.000000000 -0700
@@ -72,6 +72,9 @@
#endif

extern void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry);
+extern void * jbd_slab_alloc(size_t size, gfp_t flags);
+extern void jbd_slab_free(void *ptr, size_t size);
+
#define jbd_kmalloc(size, flags) \
__jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry)
#define jbd_rep_kmalloc(size, flags) \



2006-08-25 02:24:07

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [Ext2-devel] [RFC][PATCH] Manage jbd allocations from its own slabs

On Thu, Aug 24, 2006 at 12:11:25PM -0700, Badari Pulavarty wrote:
> > Why can't you just use alloc_page? I bet the whole slab overhead
> > eats more memory than what's wasted when using alloc_pages. Especially
> > as the typical usecase is a 4k blocks filesystem with 4k pagesize
> > where the overhead of alloc_page is non-existant.
>
> Yes. That was what proposed earlier. But for 1k, 2k allocations we
> end up wasting whole page. Isn't it ? Thats why I created right
> sized slabs and disable slab-debug. I guess, I can do this only for
> 1k, 2k filesystems and directly use alloc_page() for 4k and 8k - but
> that would make code ugly and also it doesn't handle cases for
> bigger base pagesize systems (64k power).

Is there some way we can cleanly have a shortcut case where we use
alloc_page() if fs_blocksize == PAGE_SIZE? The efficiency gains for
what will be the common case on many architectures will probably make
this worthwhile....

- Ted