Hi:
Since we frequently run in memory-constrained systems with many partitions,
the ~68K for each partition for the mb_history buffer can be excessive. The
following creates a new proc file under /proc/fs/ext4/ to control the number
of entries at mount time.
If the notion of a history length tunable is okay, but the location should
be under /sys/fs/ext4/ instead of /proc/fs/ext4/, I can change this. The
leftover files under /proc/fs/ext4/<partition>/ are a bit confusing to me.
Thanks,
Curt
Signed-off-by: Curt Wohlgemuth <[email protected]>
---
--- ext4/fs/ext4/mballoc.c.orig 2009-04-07 09:13:12.000000000 -0700
+++ ext4/fs/ext4/mballoc.c 2009-04-07 10:08:05.000000000 -0700
@@ -334,6 +334,10 @@
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_ext_cachep;
+
+#define DEFAULT_HIST_SIZE 1000
+static int ext4_mbhist_size = DEFAULT_HIST_SIZE;
+
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb,
void *bitmap,
@@ -2417,31 +2421,36 @@ static struct file_operations ext4_mb_se
static void ext4_mb_history_release(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int delete_mbhist = sbi->s_mb_history_max != 0;
if (sbi->s_proc != NULL) {
remove_proc_entry("mb_groups", sbi->s_proc);
- remove_proc_entry("mb_history", sbi->s_proc);
+ if (delete_mbhist)
+ remove_proc_entry("mb_history", sbi->s_proc);
}
- kfree(sbi->s_mb_history);
+ if (delete_mbhist)
+ kfree(sbi->s_mb_history);
}
static void ext4_mb_history_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int i;
+ int create_mbhist = ext4_mbhist_size != 0;
if (sbi->s_proc != NULL) {
- proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_history_fops, sb);
+ if (create_mbhist)
+ proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_history_fops, sb);
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
}
- sbi->s_mb_history_max = 1000;
+ sbi->s_mb_history_max = ext4_mbhist_size;
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+ sbi->s_mb_history = create_mbhist ? kzalloc(i, GFP_KERNEL) : NULL;
/* if we can't allocate history, then we simple won't use it */
}
@@ -2894,6 +2903,49 @@ static void release_blocks_on_commit(jou
mb_debug("freed %u blocks in %u structures\n", count, count2);
}
+static ssize_t read_mb_hist_size(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char buffer[20];
+ size_t len;
+
+ len = snprintf(buffer, sizeof(buffer), "%i\n", ext4_mbhist_size);
+
+ return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t write_mb_hist_size(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ /* This allows for 99999 entries, at 68 bytes each */
+#define MAX_BUFSIZE 6
+ char kbuf[MAX_BUFSIZE + 1];
+ int value;
+
+ if (count) {
+ if (count > MAX_BUFSIZE)
+ return -EINVAL;
+ if (copy_from_user(&kbuf, buf, count))
+ return -EFAULT;
+ kbuf[min(count, sizeof(kbuf))-1] = '\0';
+
+
+ value = simple_strtol(kbuf, NULL, 0);
+
+ if (value < 0)
+ return -EINVAL;
+
+ ext4_mbhist_size = value;
+ }
+ return count;
+#undef MAX_BUFSIZE
+}
+
+static struct file_operations ext4_mb_size_fops = {
+ .read = read_mb_hist_size,
+ .write = write_mb_hist_size,
+};
+
int __init init_ext4_mballoc(void)
{
ext4_pspace_cachep =
@@ -2912,6 +2964,9 @@ int __init init_ext4_mballoc(void)
return -ENOMEM;
}
+ proc_create("mb_hist_size", S_IRUGO | S_IWUGO, ext4_proc_root,
+ &ext4_mb_size_fops);
+
ext4_free_ext_cachep =
kmem_cache_create("ext4_free_block_extents",
sizeof(struct ext4_free_data),
On Tue, Apr 7, 2009 at 10:20 AM, Curt Wohlgemuth <[email protected]> wrote:
> Hi:
>
> Since we frequently run in memory-constrained systems with many partitions,
> the ~68K for each partition for the mb_history buffer can be excessive. ?The
> following creates a new proc file under /proc/fs/ext4/ to control the number
> of entries at mount time.
>
> If the notion of a history length tunable is okay, but the location should
> be under /sys/fs/ext4/ instead of /proc/fs/ext4/, I can change this. ?The
> leftover files under /proc/fs/ext4/<partition>/ are a bit confusing to me.
>
Does the silence mean that there is no interest in this CL?
We thought making this a run time tunable would be cool.
mrubin
On Sat, Apr 18, 2009 at 12:51:43AM -0700, Michael Rubin wrote:
> On Tue, Apr 7, 2009 at 10:20 AM, Curt Wohlgemuth <[email protected]> wrote:
> > Hi:
> >
> > Since we frequently run in memory-constrained systems with many partitions,
> > the ~68K for each partition for the mb_history buffer can be excessive. ?The
> > following creates a new proc file under /proc/fs/ext4/ to control the number
> > of entries at mount time.
> >
> > If the notion of a history length tunable is okay, but the location should
> > be under /sys/fs/ext4/ instead of /proc/fs/ext4/, I can change this. ?The
> > leftover files under /proc/fs/ext4/<partition>/ are a bit confusing to me.
> >
>
> Does the silence mean that there is no interest in this CL?
No, just that you hit the ext4 dev's during the Linux storage and
filesystem workshop, and some of us are still trying to catch up from
that. This does seem like a reasonable patch.
Yes, it should be in /sys/fs/ext4 (and this should actually be easier
to support than /proc/fs/ext4). I've left some stuff under /proc
because /sys uses the paradigm of returning a single file per
individual tunable, which works great for most things, but it's not so
hot for things like /proc/fs/ext4/<dev>/mb_history. However, tunables
should go under /sys/fs/ext4.
- Ted
On Apr 07, 2009 10:20 -0700, Curt Wohlgemuth wrote:
> Since we frequently run in memory-constrained systems with many partitions,
> the ~68K for each partition for the mb_history buffer can be excessive. The
> following creates a new proc file under /proc/fs/ext4/ to control the number
> of entries at mount time.
Sorry for the delay, it's been a hectic 2 weeks. I'm totally OK with this
patch.
> If the notion of a history length tunable is okay, but the location should
> be under /sys/fs/ext4/ instead of /proc/fs/ext4/, I can change this. The
> leftover files under /proc/fs/ext4/<partition>/ are a bit confusing to me.
Complex /proc files like this cannot be moved into /sys because it
does not support the seqfile interface for having output span more
than one page.
> ---
> --- ext4/fs/ext4/mballoc.c.orig 2009-04-07 09:13:12.000000000 -0700
> +++ ext4/fs/ext4/mballoc.c 2009-04-07 10:08:05.000000000 -0700
> @@ -334,6 +334,10 @@
> static struct kmem_cache *ext4_pspace_cachep;
> static struct kmem_cache *ext4_ac_cachep;
> static struct kmem_cache *ext4_free_ext_cachep;
> +
> +#define DEFAULT_HIST_SIZE 1000
> +static int ext4_mbhist_size = DEFAULT_HIST_SIZE;
> +
> static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
> ext4_group_t group);
> static void ext4_mb_generate_from_freelist(struct super_block *sb,
> void *bitmap,
> @@ -2417,31 +2421,36 @@ static struct file_operations ext4_mb_se
> static void ext4_mb_history_release(struct super_block *sb)
> {
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> + int delete_mbhist = sbi->s_mb_history_max != 0;
>
> if (sbi->s_proc != NULL) {
> remove_proc_entry("mb_groups", sbi->s_proc);
> - remove_proc_entry("mb_history", sbi->s_proc);
> + if (delete_mbhist)
> + remove_proc_entry("mb_history", sbi->s_proc);
> }
> - kfree(sbi->s_mb_history);
> + if (delete_mbhist)
> + kfree(sbi->s_mb_history);
If sbi->s_mb_history == NULL (as opposed to garbage) it is OK to
just call kfree() without the extra check.
> static void ext4_mb_history_init(struct super_block *sb)
> {
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> int i;
> + int create_mbhist = ext4_mbhist_size != 0;
>
> if (sbi->s_proc != NULL) {
> - proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
> - &ext4_mb_seq_history_fops, sb);
> + if (create_mbhist)
> + proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
> + &ext4_mb_seq_history_fops, sb);
> proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
> &ext4_mb_seq_groups_fops, sb);
> }
>
> - sbi->s_mb_history_max = 1000;
> + sbi->s_mb_history_max = ext4_mbhist_size;
> sbi->s_mb_history_cur = 0;
> spin_lock_init(&sbi->s_mb_history_lock);
> i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
> - sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
> + sbi->s_mb_history = create_mbhist ? kzalloc(i, GFP_KERNEL) : NULL;
> /* if we can't allocate history, then we simple won't use it */
> }
Since the s_mb_history buffers are allocated at mount time only,
and later writing to mb_hist_size does not affect their size,
it means that mb_hist_size must be set before the filesystems are
mounted. However, that makes it impossible to tune the root
filesystem history size, and at best it is difficult to tune the
other filesystems because the value has to be set by an init
script after root mounts but before other filesystems mount.
Another possible option is to have a module parameter that can be
set when the ext4 module is inserted, so that it is sure to be
available for all filesystems, but this won't help if ext4 is
built into the kernel.
The final (though more complex) option is to add a per-fs
tunable that allows changing the history size at runtime for
each filesystem.
> @@ -2894,6 +2903,49 @@ static void release_blocks_on_commit(jou
> mb_debug("freed %u blocks in %u structures\n", count, count2);
> }
>
> +static ssize_t read_mb_hist_size(struct file *file, char __user *buf,
> + size_t count, loff_t *ppos)
> +static ssize_t write_mb_hist_size(struct file *file, const char __user *buf,
> + size_t count, loff_t *ppos)
Please add an ext4_ prefix to these function names.
> +{
> + /* This allows for 99999 entries, at 68 bytes each */
> +#define MAX_BUFSIZE 6
> + char kbuf[MAX_BUFSIZE + 1];
> + int value;
> +
> + if (count) {
> + if (count > MAX_BUFSIZE)
> + return -EINVAL;
> + if (copy_from_user(&kbuf, buf, count))
> + return -EFAULT;
> + kbuf[min(count, sizeof(kbuf))-1] = '\0';
This is could probably be avoided. Simply initializing "kbuf" at
declaration time will ensure that there is a trailing NUL because
at most MAX_BUFSIZE bytes are copied into it.
char kbuf[MAX_BUFSIZE + 1] = "";
> + value = simple_strtol(kbuf, NULL, 0);
> +
> + if (value < 0)
> + return -EINVAL;
> +
> + ext4_mbhist_size = value;
> + }
> + return count;
> +#undef MAX_BUFSIZE
> +}
> +
> +static struct file_operations ext4_mb_size_fops = {
> + .read = read_mb_hist_size,
> + .write = write_mb_hist_size,
> +};
> +
> int __init init_ext4_mballoc(void)
> {
> ext4_pspace_cachep =
> @@ -2912,6 +2964,9 @@ int __init init_ext4_mballoc(void)
> return -ENOMEM;
> }
>
> + proc_create("mb_hist_size", S_IRUGO | S_IWUGO, ext4_proc_root,
> + &ext4_mb_size_fops);
> +
> ext4_free_ext_cachep =
> kmem_cache_create("ext4_free_block_extents",
> sizeof(struct ext4_free_data),
Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
Hi Andreas:
> On Apr 07, 2009 10:20 -0700, Curt Wohlgemuth wrote:
> > Since we frequently run in memory-constrained systems with many partitions,
> > the ~68K for each partition for the mb_history buffer can be excessive. The
> > following creates a new proc file under /proc/fs/ext4/ to control the number
> > of entries at mount time.
>
> Sorry for the delay, it's been a hectic 2 weeks. I'm totally OK with this
> patch.
Thanks for the detailed reply.
> > If the notion of a history length tunable is okay, but the location should
> > be under /sys/fs/ext4/ instead of /proc/fs/ext4/, I can change this. The
> > leftover files under /proc/fs/ext4/<partition>/ are a bit confusing to me.
>
> Complex /proc files like this cannot be moved into /sys because it
> does not support the seqfile interface for having output span more
> than one page.
Got it.
> > static void ext4_mb_generate_from_freelist(struct super_block *sb,
> > void *bitmap,
> > @@ -2417,31 +2421,36 @@ static struct file_operations ext4_mb_se
> > static void ext4_mb_history_release(struct super_block *sb)
> > {
> > struct ext4_sb_info *sbi = EXT4_SB(sb);
> > + int delete_mbhist = sbi->s_mb_history_max != 0;
> >
> > if (sbi->s_proc != NULL) {
> > remove_proc_entry("mb_groups", sbi->s_proc);
> > - remove_proc_entry("mb_history", sbi->s_proc);
> > + if (delete_mbhist)
> > + remove_proc_entry("mb_history", sbi->s_proc);
> > }
> > - kfree(sbi->s_mb_history);
> > + if (delete_mbhist)
> > + kfree(sbi->s_mb_history);
>
> If sbi->s_mb_history == NULL (as opposed to garbage) it is OK to
> just call kfree() without the extra check.
Done.
> Since the s_mb_history buffers are allocated at mount time only,
> and later writing to mb_hist_size does not affect their size,
> it means that mb_hist_size must be set before the filesystems are
> mounted. However, that makes it impossible to tune the root
> filesystem history size, and at best it is difficult to tune the
> other filesystems because the value has to be set by an init
> script after root mounts but before other filesystems mount.
>
> Another possible option is to have a module parameter that can be
> set when the ext4 module is inserted, so that it is sure to be
> available for all filesystems, but this won't help if ext4 is
> built into the kernel.
Which is certainly our case.
> The final (though more complex) option is to add a per-fs
> tunable that allows changing the history size at runtime for
> each filesystem.
Right, I thought about this. But avoiding the race when the tunable is
being changed at the same time that history is being updated just didn't
seem worth it to me.
I see the mb_history mostly as a debug issue; having to umount/remount a FS
to change history doesn't seem too onerous -- with the exception, of course,
of the root FS. Do you think this is too big a burden, and I should look at
handling it while a FS is mounted?
> > @@ -2894,6 +2903,49 @@ static void release_blocks_on_commit(jou
> > mb_debug("freed %u blocks in %u structures\n", count, count2);
> > }
> >
> > +static ssize_t read_mb_hist_size(struct file *file, char __user *buf,
> > + size_t count, loff_t *ppos)
> > +static ssize_t write_mb_hist_size(struct file *file, const char __user *buf,
> > + size_t count, loff_t *ppos)
>
> Please add an ext4_ prefix to these function names.
Done.
> > +{
> > + /* This allows for 99999 entries, at 68 bytes each */
> > +#define MAX_BUFSIZE 6
> > + char kbuf[MAX_BUFSIZE + 1];
> > + int value;
> > +
> > + if (count) {
> > + if (count > MAX_BUFSIZE)
> > + return -EINVAL;
> > + if (copy_from_user(&kbuf, buf, count))
> > + return -EFAULT;
> > + kbuf[min(count, sizeof(kbuf))-1] = '\0';
>
> This is could probably be avoided. Simply initializing "kbuf" at
> declaration time will ensure that there is a trailing NUL because
> at most MAX_BUFSIZE bytes are copied into it.
>
> char kbuf[MAX_BUFSIZE + 1] = "";
Ah, right, thanks; done.
Second patch follows.
Signed-off-by: Curt Wohlgemuth <[email protected]>
---
--- ext4/fs/ext4/mballoc.c.orig 2009-04-07 09:13:12.000000000 -0700
+++ ext4/fs/ext4/mballoc.c 2009-04-20 11:22:25.000000000 -0700
@@ -334,6 +334,10 @@
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_ext_cachep;
+
+#define DEFAULT_HIST_SIZE 1000
+static int ext4_mbhist_size = DEFAULT_HIST_SIZE;
+
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb,
void *bitmap,
@@ -2417,10 +2421,12 @@
static void ext4_mb_history_release(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int delete_mbhist = sbi->s_mb_history_max != 0;
if (sbi->s_proc != NULL) {
remove_proc_entry("mb_groups", sbi->s_proc);
- remove_proc_entry("mb_history", sbi->s_proc);
+ if (delete_mbhist)
+ remove_proc_entry("mb_history", sbi->s_proc);
}
kfree(sbi->s_mb_history);
}
@@ -2429,19 +2435,21 @@
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int i;
+ int create_mbhist = ext4_mbhist_size != 0;
if (sbi->s_proc != NULL) {
- proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_history_fops, sb);
+ if (create_mbhist)
+ proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_history_fops, sb);
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
}
- sbi->s_mb_history_max = 1000;
+ sbi->s_mb_history_max = ext4_mbhist_size;
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+ sbi->s_mb_history = create_mbhist ? kzalloc(i, GFP_KERNEL) : NULL;
/* if we can't allocate history, then we simple won't use it */
}
@@ -2894,6 +2902,48 @@
mb_debug("freed %u blocks in %u structures\n", count, count2);
}
+static ssize_t ext4_read_mb_hist_size(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char buffer[20];
+ size_t len;
+
+ len = snprintf(buffer, sizeof(buffer), "%i\n", ext4_mbhist_size);
+
+ return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t ext4_write_mb_hist_size(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ /* This allows for 99999 entries, at 68 bytes each */
+#define MAX_BUFSIZE 6
+ char kbuf[MAX_BUFSIZE + 1] = "";
+ int value;
+
+ if (count) {
+ if (count > MAX_BUFSIZE)
+ return -EINVAL;
+ if (copy_from_user(&kbuf, buf, count))
+ return -EFAULT;
+
+ value = simple_strtol(kbuf, NULL, 0);
+
+ if (value < 0)
+ return -EINVAL;
+
+ ext4_mbhist_size = value;
+ }
+ return count;
+#undef MAX_BUFSIZE
+}
+
+static struct file_operations ext4_mb_size_fops = {
+ .read = ext4_read_mb_hist_size,
+ .write = ext4_write_mb_hist_size,
+};
+
int __init init_ext4_mballoc(void)
{
ext4_pspace_cachep =
@@ -2912,6 +2962,9 @@
return -ENOMEM;
}
+ proc_create("mb_hist_size", S_IRUGO | S_IWUGO, ext4_proc_root,
+ &ext4_mb_size_fops);
+
ext4_free_ext_cachep =
kmem_cache_create("ext4_free_block_extents",
sizeof(struct ext4_free_data),
On Mon, Apr 20, 2009 at 11:53:12AM -0700, Curt Wohlgemuth wrote:
>
> > Since the s_mb_history buffers are allocated at mount time only,
> > and later writing to mb_hist_size does not affect their size,
> > it means that mb_hist_size must be set before the filesystems are
> > mounted. However, that makes it impossible to tune the root
> > filesystem history size, and at best it is difficult to tune the
> > other filesystems because the value has to be set by an init
> > script after root mounts but before other filesystems mount.
If it's only going to be allocated at mount-time (as opposed doing it
at run-time --- where for simplicities sake I would just simply flush
the existing history when resizing the mb_history buffers), I would
make this a mount option. This would allow it to be set for the root
filesystem via a boot option (rootfsflags).
> Right, I thought about this. But avoiding the race when the tunable is
> being changed at the same time that history is being updated just didn't
> seem worth it to me.
Fixing this wouldn't be that hard; I would use read-copy-update (RCU)
and just free the old history when changing the size of the
mb_history. But setting it as a mount option would also be fine, I
think.
- Ted
From: Curt Wohlgemuth <[email protected]>
In memory-constrained systems with many partitions, the ~68K for each
partition for the mb_history buffer can be excessive.
This patch adds a new mount option, mb_history_length, as well as a
way of setting the default via a module parameter (or via a sysfs
parameter in /sys/module/ext4/parameter/default_mb_history_length).
If the mb_history_length is set to zero, the mb_history facility is
disabled entirely.
Signed-off-by: Curt Wohlgemuth <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
I completely revamped how how tunable is set. The combination of a
mount option plus a default which can be tuned via a sysfs file or a
module parameter is cleaner, and certainly requires less code. (No need
to open-code new file in /proc, which should be avoided if at all
possible.)
Anyway, this is what I've dropped into the ext4 patch queue.
fs/ext4/mballoc.c | 13 +++++++------
fs/ext4/super.c | 18 +++++++++++++++++-
2 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dbd47ea..df75855 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2413,7 +2413,8 @@ static void ext4_mb_history_release(struct super_block *sb)
if (sbi->s_proc != NULL) {
remove_proc_entry("mb_groups", sbi->s_proc);
- remove_proc_entry("mb_history", sbi->s_proc);
+ if (sbi->s_mb_history_max)
+ remove_proc_entry("mb_history", sbi->s_proc);
}
kfree(sbi->s_mb_history);
}
@@ -2424,17 +2425,17 @@ static void ext4_mb_history_init(struct super_block *sb)
int i;
if (sbi->s_proc != NULL) {
- proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_history_fops, sb);
+ if (sbi->s_mb_history_max)
+ proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_history_fops, sb);
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
}
- sbi->s_mb_history_max = 1000;
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+ sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
/* if we can't allocate history, then we simple won't use it */
}
@@ -2444,7 +2445,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_mb_history h;
- if (unlikely(sbi->s_mb_history == NULL))
+ if (sbi->s_mb_history == NULL)
return;
if (!(ac->ac_op & sbi->s_mb_history_filter))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7903f20..39223a5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -47,6 +47,13 @@
#include "xattr.h"
#include "acl.h"
+static int default_mb_history_length = 1000;
+
+module_param_named(default_mb_history_length, default_mb_history_length,
+ int, 0644);
+MODULE_PARM_DESC(default_mb_history_length,
+ "Default number of entries saved for mb_history");
+
struct proc_dir_entry *ext4_proc_root;
static struct kset *ext4_kset;
@@ -1042,7 +1049,7 @@ enum {
Opt_journal_update, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_data_err_abort, Opt_data_err_ignore,
+ Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1088,6 +1095,7 @@ static const match_table_t tokens = {
{Opt_data_writeback, "data=writeback"},
{Opt_data_err_abort, "data_err=abort"},
{Opt_data_err_ignore, "data_err=ignore"},
+ {Opt_mb_history_length, "mb_history_length=%u"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
@@ -1329,6 +1337,13 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_data_err_ignore:
clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
break;
+ case Opt_mb_history_length:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0)
+ return 0;
+ sbi->s_mb_history_max = option;
+ break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
qtype = USRQUOTA;
@@ -2345,6 +2360,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+ sbi->s_mb_history_max = default_mb_history_length;
set_opt(sbi->s_mount_opt, BARRIER);
--
1.6.0.4
Hi Ted:
Thanks for sending this out. It was on my list to re-vamp the patch
to use your suggestion of a mount option, but I didn't get a chance.
I tested it on our 2.6.26+patches kernel, and it works fine.
Curt
On Fri, May 1, 2009 at 5:31 PM, Theodore Ts'o <[email protected]> wrote:
> From: Curt Wohlgemuth <[email protected]>
>
> In memory-constrained systems with many partitions, the ~68K for each
> partition for the mb_history buffer can be excessive.
>
> This patch adds a new mount option, mb_history_length, as well as a
> way of setting the default via a module parameter (or via a sysfs
> parameter in /sys/module/ext4/parameter/default_mb_history_length).
> If the mb_history_length is set to zero, the mb_history facility is
> disabled entirely.
>
> Signed-off-by: Curt Wohlgemuth <[email protected]>
> Signed-off-by: "Theodore Ts'o" <[email protected]>
> ---
>
> I completely revamped how how tunable is set. ?The combination of a
> mount option plus a default which can be tuned via a sysfs file or a
> module parameter is cleaner, and certainly requires less code. ?(No need
> to open-code new file in /proc, which should be avoided if at all
> possible.)
>
> Anyway, this is what I've dropped into the ext4 patch queue.
>
> ?fs/ext4/mballoc.c | ? 13 +++++++------
> ?fs/ext4/super.c ? | ? 18 +++++++++++++++++-
> ?2 files changed, 24 insertions(+), 7 deletions(-)
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index dbd47ea..df75855 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2413,7 +2413,8 @@ static void ext4_mb_history_release(struct super_block *sb)
>
> ? ? ? ?if (sbi->s_proc != NULL) {
> ? ? ? ? ? ? ? ?remove_proc_entry("mb_groups", sbi->s_proc);
> - ? ? ? ? ? ? ? remove_proc_entry("mb_history", sbi->s_proc);
> + ? ? ? ? ? ? ? if (sbi->s_mb_history_max)
> + ? ? ? ? ? ? ? ? ? ? ? remove_proc_entry("mb_history", sbi->s_proc);
> ? ? ? ?}
> ? ? ? ?kfree(sbi->s_mb_history);
> ?}
> @@ -2424,17 +2425,17 @@ static void ext4_mb_history_init(struct super_block *sb)
> ? ? ? ?int i;
>
> ? ? ? ?if (sbi->s_proc != NULL) {
> - ? ? ? ? ? ? ? proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?&ext4_mb_seq_history_fops, sb);
> + ? ? ? ? ? ? ? if (sbi->s_mb_history_max)
> + ? ? ? ? ? ? ? ? ? ? ? proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?&ext4_mb_seq_history_fops, sb);
> ? ? ? ? ? ? ? ?proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? &ext4_mb_seq_groups_fops, sb);
> ? ? ? ?}
>
> - ? ? ? sbi->s_mb_history_max = 1000;
> ? ? ? ?sbi->s_mb_history_cur = 0;
> ? ? ? ?spin_lock_init(&sbi->s_mb_history_lock);
> ? ? ? ?i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
> - ? ? ? sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
> + ? ? ? sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
> ? ? ? ?/* if we can't allocate history, then we simple won't use it */
> ?}
>
> @@ -2444,7 +2445,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
> ? ? ? ?struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> ? ? ? ?struct ext4_mb_history h;
>
> - ? ? ? if (unlikely(sbi->s_mb_history == NULL))
> + ? ? ? if (sbi->s_mb_history == NULL)
> ? ? ? ? ? ? ? ?return;
>
> ? ? ? ?if (!(ac->ac_op & sbi->s_mb_history_filter))
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 7903f20..39223a5 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -47,6 +47,13 @@
> ?#include "xattr.h"
> ?#include "acl.h"
>
> +static int default_mb_history_length = 1000;
> +
> +module_param_named(default_mb_history_length, default_mb_history_length,
> + ? ? ? ? ? ? ? ? ?int, 0644);
> +MODULE_PARM_DESC(default_mb_history_length,
> + ? ? ? ? ? ? ? ?"Default number of entries saved for mb_history");
> +
> ?struct proc_dir_entry *ext4_proc_root;
> ?static struct kset *ext4_kset;
>
> @@ -1042,7 +1049,7 @@ enum {
> ? ? ? ?Opt_journal_update, Opt_journal_dev,
> ? ? ? ?Opt_journal_checksum, Opt_journal_async_commit,
> ? ? ? ?Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
> - ? ? ? Opt_data_err_abort, Opt_data_err_ignore,
> + ? ? ? Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
> ? ? ? ?Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
> ? ? ? ?Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
> ? ? ? ?Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
> @@ -1088,6 +1095,7 @@ static const match_table_t tokens = {
> ? ? ? ?{Opt_data_writeback, "data=writeback"},
> ? ? ? ?{Opt_data_err_abort, "data_err=abort"},
> ? ? ? ?{Opt_data_err_ignore, "data_err=ignore"},
> + ? ? ? {Opt_mb_history_length, "mb_history_length=%u"},
> ? ? ? ?{Opt_offusrjquota, "usrjquota="},
> ? ? ? ?{Opt_usrjquota, "usrjquota=%s"},
> ? ? ? ?{Opt_offgrpjquota, "grpjquota="},
> @@ -1329,6 +1337,13 @@ static int parse_options(char *options, struct super_block *sb,
> ? ? ? ? ? ? ? ?case Opt_data_err_ignore:
> ? ? ? ? ? ? ? ? ? ? ? ?clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
> ? ? ? ? ? ? ? ? ? ? ? ?break;
> + ? ? ? ? ? ? ? case Opt_mb_history_length:
> + ? ? ? ? ? ? ? ? ? ? ? if (match_int(&args[0], &option))
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? return 0;
> + ? ? ? ? ? ? ? ? ? ? ? if (option < 0)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? return 0;
> + ? ? ? ? ? ? ? ? ? ? ? sbi->s_mb_history_max = option;
> + ? ? ? ? ? ? ? ? ? ? ? break;
> ?#ifdef CONFIG_QUOTA
> ? ? ? ? ? ? ? ?case Opt_usrjquota:
> ? ? ? ? ? ? ? ? ? ? ? ?qtype = USRQUOTA;
> @@ -2345,6 +2360,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> ? ? ? ?sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
> ? ? ? ?sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
> ? ? ? ?sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
> + ? ? ? sbi->s_mb_history_max = default_mb_history_length;
>
> ? ? ? ?set_opt(sbi->s_mount_opt, BARRIER);
>
> --
> 1.6.0.4
>
>