2008-01-29 15:50:54

by Miklos Szeredi

[permalink] [raw]
Subject: [patch 2/6] mm: bdi: export BDI attributes in sysfs

From: Peter Zijlstra <[email protected]>

Provide a place in sysfs (/sys/class/bdi) for the backing_dev_info
object. This allows us to see and set the various BDI specific
variables.

In particular this properly exposes the read-ahead window for all
relevant users and /sys/block/<block>/queue/read_ahead_kb should be
deprecated.

With patient help from Kay Sievers and Greg KH

[[email protected]]

- split off NFS and FUSE changes into separate patches
- document new sysfs attributes under Documentation/ABI
- do bdi_class_init as a core_initcall, otherwise the "default" BDI
won't be initialized
- remove bdi_init_fmt macro, it's not used very much

Signed-off-by: Peter Zijlstra <[email protected]>
CC: Kay Sievers <[email protected]>
CC: Greg KH <[email protected]>
CC: Trond Myklebust <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---

Index: linux/block/genhd.c
===================================================================
--- linux.orig/block/genhd.c 2008-01-29 13:02:41.000000000 +0100
+++ linux/block/genhd.c 2008-01-29 13:02:46.000000000 +0100
@@ -183,6 +183,8 @@ void add_disk(struct gendisk *disk)
disk->minors, NULL, exact_match, exact_lock, disk);
register_disk(disk);
blk_register_queue(disk);
+ bdi_register(&disk->queue->backing_dev_info, NULL,
+ "blk-%s", disk->disk_name);
}

EXPORT_SYMBOL(add_disk);
@@ -191,6 +193,7 @@ EXPORT_SYMBOL(del_gendisk); /* in partit
void unlink_gendisk(struct gendisk *disk)
{
blk_unregister_queue(disk);
+ bdi_unregister(&disk->queue->backing_dev_info);
blk_unregister_region(MKDEV(disk->major, disk->first_minor),
disk->minors);
}
Index: linux/include/linux/backing-dev.h
===================================================================
--- linux.orig/include/linux/backing-dev.h 2008-01-29 13:02:41.000000000 +0100
+++ linux/include/linux/backing-dev.h 2008-01-29 13:02:46.000000000 +0100
@@ -11,6 +11,8 @@
#include <linux/percpu_counter.h>
#include <linux/log2.h>
#include <linux/proportions.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
#include <asm/atomic.h>

struct page;
@@ -48,11 +50,17 @@ struct backing_dev_info {

struct prop_local_percpu completions;
int dirty_exceeded;
+
+ struct device *dev;
};

int bdi_init(struct backing_dev_info *bdi);
void bdi_destroy(struct backing_dev_info *bdi);

+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+ const char *fmt, ...);
+void bdi_unregister(struct backing_dev_info *bdi);
+
static inline void __add_bdi_stat(struct backing_dev_info *bdi,
enum bdi_stat_item item, s64 amount)
{
Index: linux/include/linux/writeback.h
===================================================================
--- linux.orig/include/linux/writeback.h 2008-01-29 13:02:41.000000000 +0100
+++ linux/include/linux/writeback.h 2008-01-29 13:02:46.000000000 +0100
@@ -113,6 +113,9 @@ struct file;
int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);

+void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+ struct backing_dev_info *bdi);
+
void page_writeback_init(void);
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied);
Index: linux/mm/backing-dev.c
===================================================================
--- linux.orig/mm/backing-dev.c 2008-01-29 13:02:41.000000000 +0100
+++ linux/mm/backing-dev.c 2008-01-29 13:03:23.000000000 +0100
@@ -4,12 +4,118 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/device.h>
+
+
+static struct class *bdi_class;
+
+static ssize_t read_ahead_kb_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ char *end;
+
+ bdi->ra_pages = simple_strtoul(buf, &end, 10) >> (PAGE_SHIFT - 10);
+
+ return end - buf;
+}
+
+#define K(pages) ((pages) << (PAGE_SHIFT - 10))
+
+#define BDI_SHOW(name, expr) \
+static ssize_t name##_show(struct device *dev, \
+ struct device_attribute *attr, char *page) \
+{ \
+ struct backing_dev_info *bdi = dev_get_drvdata(dev); \
+ \
+ return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
+}
+
+BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
+
+BDI_SHOW(reclaimable_kb, K(bdi_stat(bdi, BDI_RECLAIMABLE)))
+BDI_SHOW(writeback_kb, K(bdi_stat(bdi, BDI_WRITEBACK)))
+
+static inline unsigned long get_dirty(struct backing_dev_info *bdi, int i)
+{
+ unsigned long thresh[3];
+
+ get_dirty_limits(&thresh[0], &thresh[1], &thresh[2], bdi);
+
+ return thresh[i];
+}
+
+BDI_SHOW(dirty_kb, K(get_dirty(bdi, 1)))
+BDI_SHOW(bdi_dirty_kb, K(get_dirty(bdi, 2)))
+
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
+static struct device_attribute bdi_dev_attrs[] = {
+ __ATTR_RW(read_ahead_kb),
+ __ATTR_RO(reclaimable_kb),
+ __ATTR_RO(writeback_kb),
+ __ATTR_RO(dirty_kb),
+ __ATTR_RO(bdi_dirty_kb),
+ __ATTR_NULL,
+};
+
+static __init int bdi_class_init(void)
+{
+ bdi_class = class_create(THIS_MODULE, "bdi");
+ bdi_class->dev_attrs = bdi_dev_attrs;
+ return 0;
+}
+
+core_initcall(bdi_class_init);
+
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+ const char *fmt, ...)
+{
+ char *name;
+ va_list args;
+ int ret = 0;
+ struct device *dev;
+
+ va_start(args, fmt);
+ name = kvasprintf(GFP_KERNEL, fmt, args);
+ va_end(args);
+
+ if (!name)
+ return -ENOMEM;
+
+ dev = device_create(bdi_class, parent, MKDEV(0, 0), name);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto exit;
+ }
+
+ bdi->dev = dev;
+ dev_set_drvdata(bdi->dev, bdi);
+
+exit:
+ kfree(name);
+ return ret;
+}
+EXPORT_SYMBOL(bdi_register);
+
+void bdi_unregister(struct backing_dev_info *bdi)
+{
+ if (bdi->dev) {
+ device_unregister(bdi->dev);
+ bdi->dev = NULL;
+ }
+}
+EXPORT_SYMBOL(bdi_unregister);

int bdi_init(struct backing_dev_info *bdi)
{
int i;
int err;

+ bdi->dev = NULL;
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
if (err)
@@ -33,6 +139,8 @@ void bdi_destroy(struct backing_dev_info
{
int i;

+ bdi_unregister(bdi);
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);

Index: linux/mm/page-writeback.c
===================================================================
--- linux.orig/mm/page-writeback.c 2008-01-29 13:02:41.000000000 +0100
+++ linux/mm/page-writeback.c 2008-01-29 13:02:46.000000000 +0100
@@ -304,7 +304,7 @@ static unsigned long determine_dirtyable
return x + 1; /* Ensure that we never return 0 */
}

-static void
+void
get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
struct backing_dev_info *bdi)
{
Index: linux/lib/percpu_counter.c
===================================================================
--- linux.orig/lib/percpu_counter.c 2008-01-29 13:02:41.000000000 +0100
+++ linux/lib/percpu_counter.c 2008-01-29 13:02:46.000000000 +0100
@@ -102,6 +102,7 @@ void percpu_counter_destroy(struct percp
return;

free_percpu(fbc->counters);
+ fbc->counters = NULL;
#ifdef CONFIG_HOTPLUG_CPU
mutex_lock(&percpu_counters_lock);
list_del(&fbc->list);
Index: linux/mm/readahead.c
===================================================================
--- linux.orig/mm/readahead.c 2008-01-29 13:02:41.000000000 +0100
+++ linux/mm/readahead.c 2008-01-29 13:02:46.000000000 +0100
@@ -235,7 +235,13 @@ unsigned long max_sane_readahead(unsigne

static int __init readahead_init(void)
{
- return bdi_init(&default_backing_dev_info);
+ int err;
+
+ err = bdi_init(&default_backing_dev_info);
+ if (!err)
+ bdi_register(&default_backing_dev_info, NULL, "default");
+
+ return err;
}
subsys_initcall(readahead_init);

Index: linux/Documentation/ABI/testing/sysfs-class-bdi
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux/Documentation/ABI/testing/sysfs-class-bdi 2008-01-29 13:02:46.000000000 +0100
@@ -0,0 +1,50 @@
+What: /sys/class/bdi/<bdi>/
+Date: January 2008
+Contact: Peter Zijlstra <[email protected]>
+Description:
+
+Provide a place in sysfs for the backing_dev_info object.
+This allows us to see and set the various BDI specific variables.
+
+The <bdi> identifyer can take the following forms:
+
+blk-NAME
+
+ Block devices, NAME is 'sda', 'loop0', etc...
+
+FSTYPE-MAJOR:MINOR
+
+ Non-block device backed filesystems which provide their own
+ BDI, such as NFS and FUSE. MAJOR:MINOR is the value of st_dev
+ for files on this filesystem.
+
+default
+
+ The default backing dev, used for non-block device backed
+ filesystems which do not provide their own BDI.
+
+Files under /sys/class/bdi/<bdi>/
+---------------------------------
+
+read_ahead_kb (read-write)
+
+ Size of the read-ahead window in kilobytes
+
+reclaimable_kb (read-only)
+
+ Reclaimable (dirty or unstable) memory destined for writeback
+ to this device
+
+writeback_kb (read-only)
+
+ Memory currently under writeback to this device
+
+dirty_kb (read-only)
+
+ Global threshold for reclaimable + writeback memory
+
+bdi_dirty_kb (read-only)
+
+ Current threshold on this BDI for reclaimable + writeback
+ memory
+

--


2008-01-29 17:40:59

by Greg KH

[permalink] [raw]
Subject: Re: [patch 2/6] mm: bdi: export BDI attributes in sysfs

On Tue, Jan 29, 2008 at 04:49:02PM +0100, Miklos Szeredi wrote:
> From: Peter Zijlstra <[email protected]>
>
> Provide a place in sysfs (/sys/class/bdi) for the backing_dev_info
> object. This allows us to see and set the various BDI specific
> variables.
>
> In particular this properly exposes the read-ahead window for all
> relevant users and /sys/block/<block>/queue/read_ahead_kb should be
> deprecated.
>
> With patient help from Kay Sievers and Greg KH
>
> [[email protected]]
>
> - split off NFS and FUSE changes into separate patches
> - document new sysfs attributes under Documentation/ABI
> - do bdi_class_init as a core_initcall, otherwise the "default" BDI
> won't be initialized
> - remove bdi_init_fmt macro, it's not used very much
>
> Signed-off-by: Peter Zijlstra <[email protected]>
> CC: Kay Sievers <[email protected]>
> CC: Greg KH <[email protected]>

Acked-by: Greg Kroah-Hartman <[email protected]>

2008-01-31 00:29:19

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 2/6] mm: bdi: export BDI attributes in sysfs

On Tue, 29 Jan 2008 16:49:02 +0100
Miklos Szeredi <[email protected]> wrote:

> From: Peter Zijlstra <[email protected]>
>
> Provide a place in sysfs (/sys/class/bdi) for the backing_dev_info
> object. This allows us to see and set the various BDI specific
> variables.
>
> In particular this properly exposes the read-ahead window for all
> relevant users and /sys/block/<block>/queue/read_ahead_kb should be
> deprecated.

This description is not complete. It implies that the readahead window is
not "properly" exposed for some "relevant" users. The reader is left
wondering what on earth this is referring to. I certainly don't know.
Perhaps when this information is revealed, we can work out what was
wrong with per-queue readahead tuning.

> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux/Documentation/ABI/testing/sysfs-class-bdi 2008-01-29 13:02:46.000000000 +0100
> @@ -0,0 +1,50 @@
> +What: /sys/class/bdi/<bdi>/
> +Date: January 2008
> +Contact: Peter Zijlstra <[email protected]>
> +Description:
> +
> +Provide a place in sysfs for the backing_dev_info object.
> +This allows us to see and set the various BDI specific variables.
> +
> +The <bdi> identifyer can take the following forms:

"identifier"

> +blk-NAME
> +
> + Block devices, NAME is 'sda', 'loop0', etc...

But if I've done `mknod /dev/pizza-party 8 0', I'm looking for
blk-pizza-party, not blk-sda.

But I might still have /dev/sda, too.

> +FSTYPE-MAJOR:MINOR
> +
> + Non-block device backed filesystems which provide their own
> + BDI, such as NFS and FUSE. MAJOR:MINOR is the value of st_dev
> + for files on this filesystem.
> +
> +default
> +
> + The default backing dev, used for non-block device backed
> + filesystems which do not provide their own BDI.
> +
> +Files under /sys/class/bdi/<bdi>/
> +---------------------------------
> +
> +read_ahead_kb (read-write)
> +
> + Size of the read-ahead window in kilobytes
> +
> +reclaimable_kb (read-only)
> +
> + Reclaimable (dirty or unstable) memory destined for writeback
> + to this device
> +
> +writeback_kb (read-only)
> +
> + Memory currently under writeback to this device
> +
> +dirty_kb (read-only)
> +
> + Global threshold for reclaimable + writeback memory
> +
> +bdi_dirty_kb (read-only)
> +
> + Current threshold on this BDI for reclaimable + writeback
> + memory
> +

I dunno. A number of the things which you're exposing are closely tied to
present-day kernel implementation and may be irrelevant or even
unimplementable in a few years' time.

At the very least you should put a HUGE warning in here telling everyone
that these files may disappear or be renamed with new semantics in the
future, and that they should design their userspace code with this in mind.

But that will only prevent userspace from outright crashing. Once we
expose functionality of this nature, people will come to depend upon it.
We can't stop this.

Suppose $CLUELESS_CORP modifies $LARGE_DATABASE so that it uses these new
fields to optimise its cache population and cache flushout strategies.
Later, we are forced to remove these fields. The database now runs all
slowly.

It's just a bad idea to expose deep kernelguts in this way. We need really
good reasons for doing so, and those reasons should be in the changelog.

2008-01-31 09:39:23

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [patch 2/6] mm: bdi: export BDI attributes in sysfs

> On Tue, 29 Jan 2008 16:49:02 +0100
> Miklos Szeredi <[email protected]> wrote:
>
> > From: Peter Zijlstra <[email protected]>
> >
> > Provide a place in sysfs (/sys/class/bdi) for the backing_dev_info
> > object. This allows us to see and set the various BDI specific
> > variables.
> >
> > In particular this properly exposes the read-ahead window for all
> > relevant users and /sys/block/<block>/queue/read_ahead_kb should be
> > deprecated.
>
> This description is not complete. It implies that the readahead window is
> not "properly" exposed for some "relevant" users. The reader is left
> wondering what on earth this is referring to. I certainly don't know.
> Perhaps when this information is revealed, we can work out what was
> wrong with per-queue readahead tuning.

I think Peter meant, that the readahead window was only exposed for
block devices, and not things like NFS or FUSE.

> > --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> > +++ linux/Documentation/ABI/testing/sysfs-class-bdi 2008-01-29 13:02:46.000000000 +0100
> > @@ -0,0 +1,50 @@
> > +What: /sys/class/bdi/<bdi>/
> > +Date: January 2008
> > +Contact: Peter Zijlstra <[email protected]>
> > +Description:
> > +
> > +Provide a place in sysfs for the backing_dev_info object.
> > +This allows us to see and set the various BDI specific variables.
> > +
> > +The <bdi> identifyer can take the following forms:
>
> "identifier"

Arrgh. Must run spellchecker on doc files :)

> > +blk-NAME
> > +
> > + Block devices, NAME is 'sda', 'loop0', etc...
>
> But if I've done `mknod /dev/pizza-party 8 0', I'm looking for
> blk-pizza-party, not blk-sda.
>
> But I might still have /dev/sda, too.

An alternative would be to uniformly use MAJOR:MINOR in there. It
would work for block devices and anonymous devices (NFS/FUSE) as well.

Would that be any better?

>
> > +FSTYPE-MAJOR:MINOR
> > +
> > + Non-block device backed filesystems which provide their own
> > + BDI, such as NFS and FUSE. MAJOR:MINOR is the value of st_dev
> > + for files on this filesystem.
> > +
> > +default
> > +
> > + The default backing dev, used for non-block device backed
> > + filesystems which do not provide their own BDI.
> > +
> > +Files under /sys/class/bdi/<bdi>/
> > +---------------------------------
> > +
> > +read_ahead_kb (read-write)
> > +
> > + Size of the read-ahead window in kilobytes
> > +
> > +reclaimable_kb (read-only)
> > +
> > + Reclaimable (dirty or unstable) memory destined for writeback
> > + to this device
> > +
> > +writeback_kb (read-only)
> > +
> > + Memory currently under writeback to this device
> > +
> > +dirty_kb (read-only)
> > +
> > + Global threshold for reclaimable + writeback memory
> > +
> > +bdi_dirty_kb (read-only)
> > +
> > + Current threshold on this BDI for reclaimable + writeback
> > + memory
> > +
>
> I dunno. A number of the things which you're exposing are closely tied to
> present-day kernel implementation and may be irrelevant or even
> unimplementable in a few years' time.

Which ones? They could possibly be moved to debugfs, or something.

I agree, that sysfs should be relatively stable.

Thanks,
Miklos

2008-01-31 09:56:05

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 2/6] mm: bdi: export BDI attributes in sysfs

On Thu, 31 Jan 2008 10:39:02 +0100 Miklos Szeredi <[email protected]> wrote:

> > On Tue, 29 Jan 2008 16:49:02 +0100
> > Miklos Szeredi <[email protected]> wrote:
> >
> > > From: Peter Zijlstra <[email protected]>
> > >
> > > Provide a place in sysfs (/sys/class/bdi) for the backing_dev_info
> > > object. This allows us to see and set the various BDI specific
> > > variables.
> > >
> > > In particular this properly exposes the read-ahead window for all
> > > relevant users and /sys/block/<block>/queue/read_ahead_kb should be
> > > deprecated.
> >
> > This description is not complete. It implies that the readahead window is
> > not "properly" exposed for some "relevant" users. The reader is left
> > wondering what on earth this is referring to. I certainly don't know.
> > Perhaps when this information is revealed, we can work out what was
> > wrong with per-queue readahead tuning.
>
> I think Peter meant, that the readahead window was only exposed for
> block devices, and not things like NFS or FUSE.

OK.

>
> > > +blk-NAME
> > > +
> > > + Block devices, NAME is 'sda', 'loop0', etc...
> >
> > But if I've done `mknod /dev/pizza-party 8 0', I'm looking for
> > blk-pizza-party, not blk-sda.
> >
> > But I might still have /dev/sda, too.
>
> An alternative would be to uniformly use MAJOR:MINOR in there. It
> would work for block devices and anonymous devices (NFS/FUSE) as well.
>
> Would that be any better?

I suppose so. sysfs likes to use symlinks to point over at related
things in different directories...

> >
> > > +FSTYPE-MAJOR:MINOR
> > > +
> > > + Non-block device backed filesystems which provide their own
> > > + BDI, such as NFS and FUSE. MAJOR:MINOR is the value of st_dev
> > > + for files on this filesystem.
> > > +
> > > +default
> > > +
> > > + The default backing dev, used for non-block device backed
> > > + filesystems which do not provide their own BDI.
> > > +
> > > +Files under /sys/class/bdi/<bdi>/
> > > +---------------------------------
> > > +
> > > +read_ahead_kb (read-write)
> > > +
> > > + Size of the read-ahead window in kilobytes
> > > +
> > > +reclaimable_kb (read-only)
> > > +
> > > + Reclaimable (dirty or unstable) memory destined for writeback
> > > + to this device
> > > +
> > > +writeback_kb (read-only)
> > > +
> > > + Memory currently under writeback to this device
> > > +
> > > +dirty_kb (read-only)
> > > +
> > > + Global threshold for reclaimable + writeback memory
> > > +
> > > +bdi_dirty_kb (read-only)
> > > +
> > > + Current threshold on this BDI for reclaimable + writeback
> > > + memory
> > > +
> >
> > I dunno. A number of the things which you're exposing are closely tied to
> > present-day kernel implementation and may be irrelevant or even
> > unimplementable in a few years' time.
>
> Which ones?

I don't know - I misplaced my copy of linux-2.6.44 :)

The whole concept of a BDI might go away, who knows? Progress in
non-volatile semiconductor storage might make the whole
rotating-platter-with-a-seek-head thing obsolete.

read_ahead_kb is likely to be stable. writeback_kb is a stable concept
too, although we might lose the ability to keep track of it some time in
the future.

Suppose that /dev/sda and /dev/sdb share the same queue - we lose the ability
to track some of these things?

> They could possibly be moved to debugfs, or something.
>
> I agree, that sysfs should be relatively stable.

This does look more like a debugging feature than a permanently-offered,
support-it-forever part of the kernel ABI.

2008-01-31 10:09:01

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 2/6] mm: bdi: export BDI attributes in sysfs


On Thu, 2008-01-31 at 01:54 -0800, Andrew Morton wrote:
> On Thu, 31 Jan 2008 10:39:02 +0100 Miklos Szeredi <[email protected]> wrote:
>
> > > On Tue, 29 Jan 2008 16:49:02 +0100
> > > Miklos Szeredi <[email protected]> wrote:
> > >
> > > > From: Peter Zijlstra <[email protected]>
> > > >
> > > > Provide a place in sysfs (/sys/class/bdi) for the backing_dev_info
> > > > object. This allows us to see and set the various BDI specific
> > > > variables.
> > > >
> > > > In particular this properly exposes the read-ahead window for all
> > > > relevant users and /sys/block/<block>/queue/read_ahead_kb should be
> > > > deprecated.
> > >
> > > This description is not complete. It implies that the readahead window is
> > > not "properly" exposed for some "relevant" users. The reader is left
> > > wondering what on earth this is referring to. I certainly don't know.
> > > Perhaps when this information is revealed, we can work out what was
> > > wrong with per-queue readahead tuning.
> >
> > I think Peter meant, that the readahead window was only exposed for
> > block devices, and not things like NFS or FUSE.
>
> OK.

And queue-less block devices like loop-back, md/dm and whatnot.

> >
> > > > +blk-NAME
> > > > +
> > > > + Block devices, NAME is 'sda', 'loop0', etc...
> > >
> > > But if I've done `mknod /dev/pizza-party 8 0', I'm looking for
> > > blk-pizza-party, not blk-sda.
> > >
> > > But I might still have /dev/sda, too.
> >
> > An alternative would be to uniformly use MAJOR:MINOR in there. It
> > would work for block devices and anonymous devices (NFS/FUSE) as well.
> >
> > Would that be any better?
>
> I suppose so. sysfs likes to use symlinks to point over at related
> things in different directories...

Yeah, I think that would work best. It's more consistent as well.

> > >
> > > > +FSTYPE-MAJOR:MINOR
> > > > +
> > > > + Non-block device backed filesystems which provide their own
> > > > + BDI, such as NFS and FUSE. MAJOR:MINOR is the value of st_dev
> > > > + for files on this filesystem.
> > > > +
> > > > +default
> > > > +
> > > > + The default backing dev, used for non-block device backed
> > > > + filesystems which do not provide their own BDI.
> > > > +
> > > > +Files under /sys/class/bdi/<bdi>/
> > > > +---------------------------------
> > > > +
> > > > +read_ahead_kb (read-write)
> > > > +
> > > > + Size of the read-ahead window in kilobytes
> > > > +
> > > > +reclaimable_kb (read-only)
> > > > +
> > > > + Reclaimable (dirty or unstable) memory destined for writeback
> > > > + to this device
> > > > +
> > > > +writeback_kb (read-only)
> > > > +
> > > > + Memory currently under writeback to this device
> > > > +
> > > > +dirty_kb (read-only)
> > > > +
> > > > + Global threshold for reclaimable + writeback memory
> > > > +
> > > > +bdi_dirty_kb (read-only)
> > > > +
> > > > + Current threshold on this BDI for reclaimable + writeback
> > > > + memory
> > > > +
> > >
> > > I dunno. A number of the things which you're exposing are closely tied to
> > > present-day kernel implementation and may be irrelevant or even
> > > unimplementable in a few years' time.
> >
> > Which ones?
>
> I don't know - I misplaced my copy of linux-2.6.44 :)
>
> The whole concept of a BDI might go away, who knows? Progress in
> non-volatile semiconductor storage might make the whole
> rotating-platter-with-a-seek-head thing obsolete.
>
> read_ahead_kb is likely to be stable. writeback_kb is a stable concept
> too, although we might lose the ability to keep track of it some time in
> the future.
>
> Suppose that /dev/sda and /dev/sdb share the same queue - we lose the ability
> to track some of these things?
>
> > They could possibly be moved to debugfs, or something.
> >
> > I agree, that sysfs should be relatively stable.
>
> This does look more like a debugging feature than a permanently-offered,
> support-it-forever part of the kernel ABI.

Agreed, all except the read_ahead tunable are debugish. The min/max
things are real tunables though. (writing up a little text on the
why/how of those as we speak - well, write)

2008-02-29 11:26:58

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 2/6] mm: bdi: export BDI attributes in sysfs

On Tue, 29 Jan 2008 16:49:02 +0100 Miklos Szeredi <[email protected]> wrote:

> From: Peter Zijlstra <[email protected]>
>
> Provide a place in sysfs (/sys/class/bdi) for the backing_dev_info
> object. This allows us to see and set the various BDI specific
> variables.
>
> In particular this properly exposes the read-ahead window for all
> relevant users and /sys/block/<block>/queue/read_ahead_kb should be
> deprecated.
>
> With patient help from Kay Sievers and Greg KH
>
> [[email protected]]
>
> - split off NFS and FUSE changes into separate patches
> - document new sysfs attributes under Documentation/ABI
> - do bdi_class_init as a core_initcall, otherwise the "default" BDI
> won't be initialized
> - remove bdi_init_fmt macro, it's not used very much

Please always provide diffstats.

Documentation/ABI/testing/sysfs-class-bdi | 50 +++++++++++++
block/genhd.c | 3
include/linux/backing-dev.h | 8 ++
include/linux/writeback.h | 3
lib/percpu_counter.c | 1
mm/backing-dev.c | 108 ++++++++++++++++++++++++++++++
mm/page-writeback.c | 2
mm/readahead.c | 8 +-
8 files changed, 181 insertions(+), 2 deletions(-)

Would you believe this breaks ia64 allmodconfig, in the usual place:

In file included from arch/ia64/ia32/sys_ia32.c:59:
arch/ia64/ia32/ia32priv.h:342:1: warning: "SET_PERSONALITY" redefined
In file included from include/linux/elf.h:7,
from include/linux/module.h:14,
from include/linux/device.h:21,
from include/linux/backing-dev.h:15,
from include/linux/nfs_fs_sb.h:5,
from include/linux/nfs_fs.h:50,
from arch/ia64/ia32/sys_ia32.c:35:
include/asm/elf.h:180:1: warning: this is the location of the previous definition


We keep on hitting stupid build errors in this area: ia64 and elf. It is
obviously quite fragile. It would be nice to fix it properly.


For now, the easy fix:

--- a/include/linux/backing-dev.h~mm-bdi-export-bdi-attributes-in-sysfs-ia64-fix
+++ a/include/linux/backing-dev.h
@@ -12,10 +12,10 @@
#include <linux/log2.h>
#include <linux/proportions.h>
#include <linux/kernel.h>
-#include <linux/device.h>
#include <asm/atomic.h>

struct page;
+struct device;

/*
* Bits in backing_dev_info.state