2014-11-13 22:09:33

by Tejun Heo

[permalink] [raw]
Subject: [PATCH vfs 1/2] lib: implement ptrset

Implement set of pointers. Pointers can be added, deleted and
iterated. It's currently implemented as a thin rbtree wrapper making
addition and removal O(log N). A drawback is that iteration isn't RCU
safe, which is okay for now. This will be used to remove
inode->i_devices.

Signed-off-by: Tejun Heo <[email protected]>
---
include/linux/ptrset.h | 74 +++++++++++++++++++++++++++
lib/Makefile | 2
lib/ptrset.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 206 insertions(+), 1 deletion(-)

--- /dev/null
+++ b/include/linux/ptrset.h
@@ -0,0 +1,74 @@
+/*
+ * include/linux/ptrset.h - set of pointers
+ *
+ * Copyright (C) 2014 Tejun Heo, Red Hat Inc.
+ *
+ * A ptrset contains an unordered set of pointers. Pointers can be added,
+ * deleted and iterated. Addition and removal complexities are O(log N)
+ * where N is the total number of elements in the ptrset.
+ */
+
+#ifndef __PTRSET_H
+#define __PTRSET_H
+
+#include <linux/preempt.h>
+#include <linux/rbtree.h>
+
+struct ptrset {
+ struct rb_root root;
+};
+
+struct ptrset_elem {
+ void *ptr;
+ struct rb_node node;
+};
+
+struct ptrset_iter {
+ struct ptrset_elem *pos;
+ struct ptrset_elem *next;
+};
+
+#define PTRSET_INITIALIZER { .root = RB_ROOT, }
+#define DEFINE_PTRSET(set) struct ptrset set = PTRSET_INITIALIZER
+
+static inline void ptrset_init(struct ptrset *set)
+{
+ *set = (struct ptrset)PTRSET_INITIALIZER;
+}
+
+void ptrset_preload(gfp_t gfp);
+int __must_check ptrset_add(void *ptr, struct ptrset *set, gfp_t gfp);
+int ptrset_del(void *ptr, struct ptrset *set);
+
+/**
+ * ptrset_preload_end - end preload section started with ptrset_preload()
+ *
+ * Each ptrset_preload() should be matched with an invocation of this
+ * function. See ptrset_preload() for details.
+ */
+static inline void ptrset_preload_end(void)
+{
+ preempt_enable();
+}
+
+/**
+ * ptrset_for_each - iterate pointers in a ptrset
+ * @ptr_cur: the cursor pointer variable (can be a pointer to any type)
+ * @set: the target ptrset
+ * @iter: pointer to struct ptrset_iter used to store iteration state
+ *
+ * Iterate @ptr_cur over all pointers in @set using @iter as the temp
+ * storage. The order of iteration is undefined and the iterated pointers
+ * may be deleted during iteration.
+ *
+ * The caller is responsible for synchronization. This is not RCU safe.
+ */
+#define ptrset_for_each(ptr_cur, set, iter) \
+ rbtree_postorder_for_each_entry_safe((iter)->pos, (iter)->next, \
+ &(set)->root, node) \
+ if (({ (ptr_cur) = (iter)->pos->ptr; \
+ false; })) \
+ ; \
+ else
+
+#endif /* __PTRSET_H */
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLA
endif

lib-y := ctype.o string.o vsprintf.o cmdline.o \
- rbtree.o radix-tree.o dump_stack.o timerqueue.o\
+ rbtree.o radix-tree.o dump_stack.o timerqueue.o ptrset.o \
idr.o int_sqrt.o extable.o \
sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
proportions.o flex_proportions.o ratelimit.o show_mem.o \
--- /dev/null
+++ b/lib/ptrset.c
@@ -0,0 +1,131 @@
+/*
+ * lib/ptrset.c - set of pointers
+ *
+ * Copyright (C) 2014 Tejun Heo, Red Hat Inc.
+ */
+
+#include <linux/ptrset.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+
+static DEFINE_PER_CPU(struct ptrset_elem *, ptrset_preload_elem);
+
+/**
+ * ptrset_preload - preload for ptrset_add()
+ * @gfp: allocation mask to use for preloading
+ *
+ * Preload per-cpu ptrset_elem buffer for ptrset_add(). Can only be used
+ * from process context and each ptrset_preload() invocation should be
+ * matched with ptrset_preload_end(). Note that preemption is disabled
+ * while preloaded.
+ *
+ * The first ptrset_add() in the preloaded section can be treated as if it
+ * were invoked with @gfp used for preloading. This allows using more
+ * permissive allocation masks for ptrsets protected by spinlocks.
+ */
+void ptrset_preload(gfp_t gfp)
+{
+ /*
+ * Consuming preload buffer from non-process context breaks preload
+ * allocation guarantee. Disallow usage from those contexts.
+ */
+ WARN_ON_ONCE(in_interrupt());
+ might_sleep_if(gfp & __GFP_WAIT);
+
+ preempt_disable();
+
+ if (!__this_cpu_read(ptrset_preload_elem)) {
+ struct ptrset_elem *new;
+
+ preempt_enable();
+ new = kmalloc(sizeof(*new), gfp);
+ preempt_disable();
+
+ new = __this_cpu_xchg(ptrset_preload_elem, new);
+ kfree(new);
+ }
+}
+
+static __always_inline bool ptrset_find_slot(void *ptr, struct ptrset *set,
+ struct rb_node ***slotp,
+ struct rb_node **parentp)
+{
+ struct rb_node **slot = &set->root.rb_node, *parent = NULL;
+
+ while (*slot) {
+ struct ptrset_elem *elem =
+ container_of(*slot, struct ptrset_elem, node);
+
+ parent = *slot;
+ if (ptr < elem->ptr) {
+ slot = &(*slot)->rb_left;
+ } else if (ptr > elem->ptr) {
+ slot = &(*slot)->rb_right;
+ } else {
+ *slotp = slot;
+ return true;
+ }
+ }
+
+ *slotp = slot;
+ if (parentp)
+ *parentp = parent;
+ return false;
+}
+
+/**
+ * ptrset_add - add a pointer to a ptrset
+ * @ptr: the pointer to add
+ * @set: the target ptrset
+ * @gfp: the allocation mask to use
+ *
+ * Allocate a ptrset_elem using @gfp, init with @ptr and add to @set.
+ * Returns 0 on success, -ENOMEM if allocation failed and -EEXIST if @ptr
+ * already exists in @set.
+ *
+ * The caller is responsible for synchronization.
+ */
+int ptrset_add(void *ptr, struct ptrset *set, gfp_t gfp)
+{
+ struct ptrset_elem *elem;
+ struct rb_node *parent, **slot;
+
+ elem = kmalloc(sizeof(*elem), gfp);
+ if (!elem && !in_interrupt()) /* see ptrset_preload() */
+ elem = this_cpu_xchg(ptrset_preload_elem, NULL);
+ if (!elem)
+ return -ENOMEM;
+ elem->ptr = ptr;
+
+ if (ptrset_find_slot(elem->ptr, set, &slot, &parent)) {
+ kfree(elem);
+ return -EEXIST;
+ }
+
+ rb_link_node(&elem->node, parent, slot);
+ rb_insert_color(&elem->node, &set->root);
+ return 0;
+}
+
+/**
+ * ptrset_del - delete a pointer from a ptrset
+ * @ptr: the pointer to delete
+ * @set: the target ptrset
+ *
+ * Delete @ptr from @set. Returns 0 on success, -ENOENT if @ptr is not in
+ * @set.
+ *
+ * The caller is responsible for synchronization.
+ */
+int ptrset_del(void *ptr, struct ptrset *set)
+{
+ struct rb_node **slot, *node;
+
+ if (!ptrset_find_slot(ptr, set, &slot, NULL))
+ return -ENOENT;
+
+ node = *slot;
+ rb_erase(node, &set->root);
+ kfree(container_of(node, struct ptrset_elem, node));
+ return 0;
+}


2014-11-13 22:11:45

by Tejun Heo

[permalink] [raw]
Subject: [PATCH vfs 2/2] {block|char}_dev: remove inode->i_devices

inode->i_devices is a list_head used to link device inodes to the
corresponding block_device or cdev. This patch makes block_device and
cdev use ptrset to keep track of the inodes instead of linking
inode->i_devices allowing removal of the field and thus reduction of
struct inode by two pointers.

The conversion is straightforward. list_add() is replaced with
preloaded ptrset_add(), list_del_init() with ptrset_del(), and list
iteration with ptrset_for_each(). The only part which isn't direct
one-to-one mapping is the error handling after ptrset_add() failure.

The two saved pointers will be used by cgroup writeback support.

Signed-off-by: Tejun Heo <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Christoph Hellwig <[email protected]>
---
fs/block_dev.c | 39 +++++++++++++++++++++++----------------
fs/char_dev.c | 25 +++++++++++++++----------
fs/inode.c | 1 -
include/linux/cdev.h | 4 ++--
include/linux/fs.h | 4 ++--
5 files changed, 42 insertions(+), 31 deletions(-)

--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -458,7 +458,7 @@ static void init_once(void *foo)

memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
- INIT_LIST_HEAD(&bdev->bd_inodes);
+ ptrset_init(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
INIT_LIST_HEAD(&bdev->bd_holder_disks);
@@ -470,7 +470,7 @@ static void init_once(void *foo)

static inline void __bd_forget(struct inode *inode)
{
- list_del_init(&inode->i_devices);
+ ptrset_del(inode, &inode->i_bdev->bd_inodes);
inode->i_bdev = NULL;
inode->i_mapping = &inode->i_data;
}
@@ -478,14 +478,15 @@ static inline void __bd_forget(struct in
static void bdev_evict_inode(struct inode *inode)
{
struct block_device *bdev = &BDEV_I(inode)->bdev;
- struct list_head *p;
+ struct ptrset_iter iter;
+ struct inode *bd_inode;
+
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
spin_lock(&bdev_lock);
- while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
- __bd_forget(list_entry(p, struct inode, i_devices));
- }
+ ptrset_for_each(bd_inode, &bdev->bd_inodes, &iter)
+ __bd_forget(bd_inode);
list_del_init(&bdev->bd_list);
spin_unlock(&bdev_lock);
}
@@ -634,20 +635,26 @@ static struct block_device *bd_acquire(s

bdev = bdget(inode->i_rdev);
if (bdev) {
+ ptrset_preload(GFP_KERNEL);
spin_lock(&bdev_lock);
if (!inode->i_bdev) {
- /*
- * We take an additional reference to bd_inode,
- * and it's released in clear_inode() of inode.
- * So, we can access it via ->i_mapping always
- * without igrab().
- */
- ihold(bdev->bd_inode);
- inode->i_bdev = bdev;
- inode->i_mapping = bdev->bd_inode->i_mapping;
- list_add(&inode->i_devices, &bdev->bd_inodes);
+ if (!ptrset_add(inode, &bdev->bd_inodes, GFP_NOWAIT)) {
+ /*
+ * We take an additional reference to bd_inode,
+ * and it's released in clear_inode() of inode.
+ * So, we can access it via ->i_mapping always
+ * without igrab().
+ */
+ ihold(bdev->bd_inode);
+ inode->i_bdev = bdev;
+ inode->i_mapping = bdev->bd_inode->i_mapping;
+ } else {
+ bdput(bdev);
+ bdev = NULL;
+ }
}
spin_unlock(&bdev_lock);
+ ptrset_preload_end();
}
return bdev;
}
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -383,16 +383,20 @@ static int chrdev_open(struct inode *ino
if (!kobj)
return -ENXIO;
new = container_of(kobj, struct cdev, kobj);
+ ptrset_preload(GFP_KERNEL);
spin_lock(&cdev_lock);
/* Check i_cdev again in case somebody beat us to it while
we dropped the lock. */
p = inode->i_cdev;
if (!p) {
- inode->i_cdev = p = new;
- list_add(&inode->i_devices, &p->list);
- new = NULL;
+ ret = ptrset_add(inode, &new->inodes, GFP_NOWAIT);
+ if (!ret) {
+ inode->i_cdev = p = new;
+ new = NULL;
+ }
} else if (!cdev_get(p))
ret = -ENXIO;
+ ptrset_preload_end();
} else if (!cdev_get(p))
ret = -ENXIO;
spin_unlock(&cdev_lock);
@@ -422,18 +426,19 @@ static int chrdev_open(struct inode *ino
void cd_forget(struct inode *inode)
{
spin_lock(&cdev_lock);
- list_del_init(&inode->i_devices);
+ ptrset_del(inode, &inode->i_cdev->inodes);
inode->i_cdev = NULL;
spin_unlock(&cdev_lock);
}

static void cdev_purge(struct cdev *cdev)
{
+ struct inode *inode;
+ struct ptrset_iter iter;
+
spin_lock(&cdev_lock);
- while (!list_empty(&cdev->list)) {
- struct inode *inode;
- inode = container_of(cdev->list.next, struct inode, i_devices);
- list_del_init(&inode->i_devices);
+ ptrset_for_each(inode, &cdev->inodes, &iter) {
+ ptrset_del(inode, &cdev->inodes);
inode->i_cdev = NULL;
}
spin_unlock(&cdev_lock);
@@ -543,7 +548,7 @@ struct cdev *cdev_alloc(void)
{
struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
if (p) {
- INIT_LIST_HEAD(&p->list);
+ ptrset_init(&p->inodes);
kobject_init(&p->kobj, &ktype_cdev_dynamic);
}
return p;
@@ -560,7 +565,7 @@ struct cdev *cdev_alloc(void)
void cdev_init(struct cdev *cdev, const struct file_operations *fops)
{
memset(cdev, 0, sizeof *cdev);
- INIT_LIST_HEAD(&cdev->list);
+ ptrset_init(&cdev->inodes);
kobject_init(&cdev->kobj, &ktype_cdev_default);
cdev->ops = fops;
}
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -366,7 +366,6 @@ void inode_init_once(struct inode *inode
{
memset(inode, 0, sizeof(*inode));
INIT_HLIST_NODE(&inode->i_hash);
- INIT_LIST_HEAD(&inode->i_devices);
INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
address_space_init_once(&inode->i_data);
--- a/include/linux/cdev.h
+++ b/include/linux/cdev.h
@@ -3,7 +3,7 @@

#include <linux/kobject.h>
#include <linux/kdev_t.h>
-#include <linux/list.h>
+#include <linux/ptrset.h>

struct file_operations;
struct inode;
@@ -13,7 +13,7 @@ struct cdev {
struct kobject kobj;
struct module *owner;
const struct file_operations *ops;
- struct list_head list;
+ struct ptrset inodes;
dev_t dev;
unsigned int count;
};
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -28,6 +28,7 @@
#include <linux/uidgid.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
+#include <linux/ptrset.h>
#include <linux/blk_types.h>

#include <asm/byteorder.h>
@@ -426,7 +427,7 @@ struct block_device {
struct inode * bd_inode; /* will die */
struct super_block * bd_super;
struct mutex bd_mutex; /* open/close mutex */
- struct list_head bd_inodes;
+ struct ptrset bd_inodes;
void * bd_claiming;
void * bd_holder;
int bd_holders;
@@ -609,7 +610,6 @@ struct inode {
#ifdef CONFIG_QUOTA
struct dquot *i_dquot[MAXQUOTAS];
#endif
- struct list_head i_devices;
union {
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev;

2014-11-13 22:23:37

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

On Thu, 13 Nov 2014 17:09:27 -0500 Tejun Heo <[email protected]> wrote:

> Implement set of pointers. Pointers can be added, deleted and
> iterated. It's currently implemented as a thin rbtree wrapper making
> addition and removal O(log N). A drawback is that iteration isn't RCU
> safe, which is okay for now. This will be used to remove
> inode->i_devices.
>

Confused.

>
> --- /dev/null
> +++ b/include/linux/ptrset.h
> @@ -0,0 +1,74 @@
> +/*
> + * include/linux/ptrset.h - set of pointers
> + *
> + * Copyright (C) 2014 Tejun Heo, Red Hat Inc.
> + *
> + * A ptrset contains an unordered set of pointers. Pointers can be added,
> + * deleted and iterated. Addition and removal complexities are O(log N)
> + * where N is the total number of elements in the ptrset.
> + */
> +
> +#ifndef __PTRSET_H
> +#define __PTRSET_H
> +
> +#include <linux/preempt.h>
> +#include <linux/rbtree.h>
> +
> +struct ptrset {
> + struct rb_root root;
> +};
> +
> +struct ptrset_elem {
> + void *ptr;
> + struct rb_node node;
> +};
> +
> +struct ptrset_iter {
> + struct ptrset_elem *pos;
> + struct ptrset_elem *next;
> +};

This seems rather slow and bloaty. Why not

struct tjpointer {
struct list_head list;
void *pointer;
};

And then callers do things like

struct tjpointer *tjp;

lock();

for_each_tjpointer(tjp, &my_tjpointer_list) {
foo(tjp->ptr);
}

tjpointer_del(tjp);

unlock();

That's less storage, vastly less support code, insertion and removal
are O(1) and it doesn't need the ghastly preload thing.

2014-11-13 22:27:41

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

Hello, Andrew.

On Thu, Nov 13, 2014 at 02:23:33PM -0800, Andrew Morton wrote:
> This seems rather slow and bloaty. Why not
>
> struct tjpointer {
> struct list_head list;
> void *pointer;
> };
>
> And then callers do things like
>
> struct tjpointer *tjp;
>
> lock();
>
> for_each_tjpointer(tjp, &my_tjpointer_list) {
> foo(tjp->ptr);
> }
>
> tjpointer_del(tjp);
>
> unlock();
>
> That's less storage, vastly less support code, insertion and removal
> are O(1) and it doesn't need the ghastly preload thing.

The goal is moving the memory necessary for indexing to the indexer
instead of the indexees. In the above case, the indexee would have to
either embed tjpointer inside it or at least have a pointer pointing
at it. With ptrset, all necessary memory areas are allocated on the
ptrset side. This is used to remove inode->i_devices list_head which
is currently occupying two pointers on all inodes while being used
only for block and char dev inodes in the cold paths.

Thanks.

--
tejun

2014-11-13 22:40:44

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

On Thu, 13 Nov 2014 17:27:36 -0500 Tejun Heo <[email protected]> wrote:

> Hello, Andrew.
>
> On Thu, Nov 13, 2014 at 02:23:33PM -0800, Andrew Morton wrote:
> > This seems rather slow and bloaty. Why not
> >
> > struct tjpointer {
> > struct list_head list;
> > void *pointer;
> > };
> >
> > And then callers do things like
> >
> > struct tjpointer *tjp;
> >
> > lock();
> >
> > for_each_tjpointer(tjp, &my_tjpointer_list) {
> > foo(tjp->ptr);
> > }
> >
> > tjpointer_del(tjp);
> >
> > unlock();
> >
> > That's less storage, vastly less support code, insertion and removal
> > are O(1) and it doesn't need the ghastly preload thing.
>
> The goal is moving the memory necessary for indexing to the indexer
> instead of the indexees. In the above case, the indexee would have to
> either embed tjpointer inside it or at least have a pointer pointing
> at it.

In that case tjpointer_add() would need to do a kmalloc() for each inode
which is added to the bdev/cdev, just as ptrset_add() is doing.

That might require a nasty preload thing. But really, for just two
known callers it would be better to require the caller to create the
storage.


struct tjpointer *new_tpj;

new_tpj = kmalloc(...);
lock();
tjpointer_add(&my_tjp_list, new_tjp, my_pointer);
unlock();

Basically what I'm saying is nuke the rbtree and use lists.

2014-11-14 13:12:08

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

Hello, Andrew.

On Thu, Nov 13, 2014 at 02:40:41PM -0800, Andrew Morton wrote:
> In that case tjpointer_add() would need to do a kmalloc() for each inode
> which is added to the bdev/cdev, just as ptrset_add() is doing.
>
> That might require a nasty preload thing. But really, for just two
> known callers it would be better to require the caller to create the
> storage.
>
>
> struct tjpointer *new_tpj;
>
> new_tpj = kmalloc(...);
> lock();
> tjpointer_add(&my_tjp_list, new_tjp, my_pointer);
> unlock();
>
> Basically what I'm saying is nuke the rbtree and use lists.

Hah? Then, each removal would be O(N) where N is the number of total
block devices and there are cases where massive number of block
devices exist and many are added / removed back-to-back. I don't
think making those operations O(N^2) is a good idea.

Thanks.

--
tejun

2014-11-18 09:15:54

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

On 11/14/2014 06:09 AM, Tejun Heo wrote:
> Implement set of pointers. Pointers can be added, deleted and
> iterated. It's currently implemented as a thin rbtree wrapper making
> addition and removal O(log N). A drawback is that iteration isn't RCU
> safe, which is okay for now. This will be used to remove
> inode->i_devices.
>
> Signed-off-by: Tejun Heo <[email protected]>
> ---
> include/linux/ptrset.h | 74 +++++++++++++++++++++++++++
> lib/Makefile | 2
> lib/ptrset.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 206 insertions(+), 1 deletion(-)
>
> --- /dev/null
> +++ b/include/linux/ptrset.h
> @@ -0,0 +1,74 @@
> +/*
> + * include/linux/ptrset.h - set of pointers
> + *
> + * Copyright (C) 2014 Tejun Heo, Red Hat Inc.
> + *
> + * A ptrset contains an unordered set of pointers. Pointers can be added,
> + * deleted and iterated. Addition and removal complexities are O(log N)
> + * where N is the total number of elements in the ptrset.
> + */
> +
> +#ifndef __PTRSET_H
> +#define __PTRSET_H
> +
> +#include <linux/preempt.h>
> +#include <linux/rbtree.h>
> +
> +struct ptrset {
> + struct rb_root root;
> +};
> +
> +struct ptrset_elem {
> + void *ptr;
> + struct rb_node node;
> +};
> +
> +struct ptrset_iter {
> + struct ptrset_elem *pos;
> + struct ptrset_elem *next;
> +};
> +
> +#define PTRSET_INITIALIZER { .root = RB_ROOT, }
> +#define DEFINE_PTRSET(set) struct ptrset set = PTRSET_INITIALIZER
> +
> +static inline void ptrset_init(struct ptrset *set)
> +{
> + *set = (struct ptrset)PTRSET_INITIALIZER;
> +}
> +
> +void ptrset_preload(gfp_t gfp);
> +int __must_check ptrset_add(void *ptr, struct ptrset *set, gfp_t gfp);
> +int ptrset_del(void *ptr, struct ptrset *set);
> +
> +/**
> + * ptrset_preload_end - end preload section started with ptrset_preload()
> + *
> + * Each ptrset_preload() should be matched with an invocation of this
> + * function. See ptrset_preload() for details.
> + */
> +static inline void ptrset_preload_end(void)
> +{
> + preempt_enable();
> +}
> +
> +/**
> + * ptrset_for_each - iterate pointers in a ptrset
> + * @ptr_cur: the cursor pointer variable (can be a pointer to any type)
> + * @set: the target ptrset
> + * @iter: pointer to struct ptrset_iter used to store iteration state
> + *
> + * Iterate @ptr_cur over all pointers in @set using @iter as the temp
> + * storage. The order of iteration is undefined and the iterated pointers
> + * may be deleted during iteration.
> + *
> + * The caller is responsible for synchronization. This is not RCU safe.
> + */
> +#define ptrset_for_each(ptr_cur, set, iter) \
> + rbtree_postorder_for_each_entry_safe((iter)->pos, (iter)->next, \
> + &(set)->root, node) \
> + if (({ (ptr_cur) = (iter)->pos->ptr; \
> + false; })) \
> + ; \
> + else
> +
> +#endif /* __PTRSET_H */
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLA
> endif
>
> lib-y := ctype.o string.o vsprintf.o cmdline.o \
> - rbtree.o radix-tree.o dump_stack.o timerqueue.o\
> + rbtree.o radix-tree.o dump_stack.o timerqueue.o ptrset.o \
> idr.o int_sqrt.o extable.o \
> sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
> proportions.o flex_proportions.o ratelimit.o show_mem.o \
> --- /dev/null
> +++ b/lib/ptrset.c
> @@ -0,0 +1,131 @@
> +/*
> + * lib/ptrset.c - set of pointers
> + *
> + * Copyright (C) 2014 Tejun Heo, Red Hat Inc.
> + */
> +
> +#include <linux/ptrset.h>
> +#include <linux/slab.h>
> +#include <linux/percpu.h>
> +
> +static DEFINE_PER_CPU(struct ptrset_elem *, ptrset_preload_elem);
> +
> +/**
> + * ptrset_preload - preload for ptrset_add()
> + * @gfp: allocation mask to use for preloading
> + *
> + * Preload per-cpu ptrset_elem buffer for ptrset_add(). Can only be used
> + * from process context and each ptrset_preload() invocation should be
> + * matched with ptrset_preload_end(). Note that preemption is disabled
> + * while preloaded.
> + *
> + * The first ptrset_add() in the preloaded section can be treated as if it
> + * were invoked with @gfp used for preloading. This allows using more
> + * permissive allocation masks for ptrsets protected by spinlocks.
> + */
> +void ptrset_preload(gfp_t gfp)
> +{
> + /*
> + * Consuming preload buffer from non-process context breaks preload
> + * allocation guarantee. Disallow usage from those contexts.
> + */
> + WARN_ON_ONCE(in_interrupt());
> + might_sleep_if(gfp & __GFP_WAIT);
> +
> + preempt_disable();
> +
> + if (!__this_cpu_read(ptrset_preload_elem)) {
> + struct ptrset_elem *new;
> +
> + preempt_enable();
> + new = kmalloc(sizeof(*new), gfp);
> + preempt_disable();
> +
> + new = __this_cpu_xchg(ptrset_preload_elem, new);
> + kfree(new);
> + }
> +}

Is it too ugly?
What will be the most important result it achieve?

2014-11-18 11:55:36

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

Hello,

On Tue, Nov 18, 2014 at 05:19:18PM +0800, Lai Jiangshan wrote:
> Is it too ugly?

What is "it"? The whole thing? percpu preloading? I'm just gonna
continue assuming that you're talking about preloading. If you think
it's ugly, please go ahead and explain why you think it is.

It's almost impossible to respond to your "review". It's not clear
what your subject matter or opinion on it is. Might as well just bang
on the keyboard randomly. When reviewing (or communicating in
general), please try to properly form and elaborate your points.
Other people can't know what's going on in your brain and have to
speculate what you could have meant.

This implementation of preloading is an evolution of a design pattern
which, IIRC, first started with the radix tree. The non-failing
aspect was introduced while the pattern was being applied to idr. I
think it's one of the better ways to implement preloading.

> What will be the most important result it achieve?

This is the same as other preloading. It allows pulling allocation
out of critical section so that it can be done with more generous
allocation mask (ie. GFP_KERNEL instead of GFP_NOWAIT). It's a common
pattern found in data structures which may allocate memory internally
such as radix tree or idr.

Thanks.

--
tejun

2014-11-18 12:10:57

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH vfs 2/2] {block|char}_dev: remove inode->i_devices

On 11/14/2014 12:11 AM, Tejun Heo wrote:
<>
> @@ -426,7 +427,7 @@ struct block_device {
> struct inode * bd_inode; /* will die */
> struct super_block * bd_super;
> struct mutex bd_mutex; /* open/close mutex */
> - struct list_head bd_inodes;
> + struct ptrset bd_inodes;
> void * bd_claiming;
> void * bd_holder;
> int bd_holders;
> @@ -609,7 +610,6 @@ struct inode {
> #ifdef CONFIG_QUOTA
> struct dquot *i_dquot[MAXQUOTAS];
> #endif
> - struct list_head i_devices;
> union {
> struct pipe_inode_info *i_pipe;
> struct block_device *i_bdev;

You do know that you can completely dodge the bullet
by making i_devices a union with some other members.

If I understand correctly these are the init_special_inode()
if (S_ISCHR(mode)) {
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
} else if (S_ISBLK(mode)) {
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;

Right? Well, if so then those special inodes do
not hold any data and never do IO of any kind
so all the inode members that are needed for IO
are candidates.
example: i_wb_list; i_lru; i_dio_count i_writecount
i_dquot (when QUOTA is on) i_private and more

Even union with the "cgroup writback support" you
want to add.

Hey interesting per-cpu code though I learned something ...
Thanks
Boaz

2014-11-18 12:30:32

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH vfs 2/2] {block|char}_dev: remove inode->i_devices

Hey, Boaz.

On Tue, Nov 18, 2014 at 02:10:50PM +0200, Boaz Harrosh wrote:
> Right? well if so than those special inodes do
> not hold any data and never do IO of any kind
> so all the inode members that are needed for IO
> are candidates.

It may not carry dirty pages but the inode itself still can get
dirtied tho.

> example: i_wb_list; i_lru; i_dio_count i_writecount

So, neither i_wb_list nor i_lru can be used. It could be that some of
the atomic counters can be used but that requires collecting four such
counters consecutively.

> i_dquot (when QUOTA is on) i_private and more

If quota is off? i_private maybe but it's not big enough.

> Even union with the "cgroup writback support" you
> want to add.

Again, these inodes can get dirtied.

I think unions are okay when lifetime rules clearly separate how the
field is used or the usages are contained in a logical unit but
overloading random fields which may be used across lifetime in a data
structure which is as widely used and abused as inode is likely to
lead to later headaches.

This is really something special and local {block|char}_dev are doing
which doesn't have to interfere with anything else. I think it's a
better approach to confine it to {block|char}_dev in the long term
even if that means carrying a bit more code.

Thanks.

--
tejun

2014-11-18 16:04:41

by Azat Khuzhin

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

On Thu, Nov 13, 2014 at 05:09:27PM -0500, Tejun Heo wrote:
> +int ptrset_add(void *ptr, struct ptrset *set, gfp_t gfp)
> +{
> + struct ptrset_elem *elem;
> + struct rb_node *parent, **slot;
> +
> + elem = kmalloc(sizeof(*elem), gfp);
> + if (!elem && !in_interrupt()) /* see ptrset_preload() */
> + elem = this_cpu_xchg(ptrset_preload_elem, NULL);
> + if (!elem)
> + return -ENOMEM;
> + elem->ptr = ptr;
> +
> + if (ptrset_find_slot(elem->ptr, set, &slot, &parent)) {
> + kfree(elem);
> + return -EEXIST;
> + }

Maybe allocation *after* ptrset_find_slot() will be better?
This will avoid extra kmalloc/kfree if such ptr already exist (and also
will avoid ENOMEM for such cases).

2014-11-18 17:16:35

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

Hello,

On Tue, Nov 18, 2014 at 06:56:29PM +0300, Azat Khuzhin wrote:
> On Thu, Nov 13, 2014 at 05:09:27PM -0500, Tejun Heo wrote:
> > +int ptrset_add(void *ptr, struct ptrset *set, gfp_t gfp)
> > +{
> > + struct ptrset_elem *elem;
> > + struct rb_node *parent, **slot;
> > +
> > + elem = kmalloc(sizeof(*elem), gfp);
> > + if (!elem && !in_interrupt()) /* see ptrset_preload() */
> > + elem = this_cpu_xchg(ptrset_preload_elem, NULL);
> > + if (!elem)
> > + return -ENOMEM;
> > + elem->ptr = ptr;
> > +
> > + if (ptrset_find_slot(elem->ptr, set, &slot, &parent)) {
> > + kfree(elem);
> > + return -EEXIST;
> > + }
>
> Maybe allocation *after* ptrset_find_slot() will be better?
> This will avoid extra kmalloc/kfree if such ptr already exist (and also
> will avoid ENOMEM for such cases).

Sure thing. Will post an updated version soon.

Thanks.

--
tejun

2014-11-18 17:49:52

by Tejun Heo

[permalink] [raw]
Subject: [PATCH vfs v2 1/2] lib: implement ptrset

Implement set of pointers. Pointers can be added, deleted and
iterated. It's currently implemented as a thin rbtree wrapper making
addition and removal O(log N). A drawback is that iteration isn't RCU
safe, which is okay for now. This will be used to remove
inode->i_devices.

v2: In ptrset_add(), ptrset_find_slot() moved before new elem
allocation to avoid unnecessary alloc/free and -ENOMEM reporting
on -EEXIST cases. Suggested by Azat Khuzhin.

Signed-off-by: Tejun Heo <[email protected]>
Cc: Azat Khuzhin <[email protected]>
---
include/linux/ptrset.h | 74 ++++++++++++++++++++++++++++
lib/Makefile | 2
lib/ptrset.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 204 insertions(+), 1 deletion(-)

--- /dev/null
+++ b/include/linux/ptrset.h
@@ -0,0 +1,74 @@
+/*
+ * include/linux/ptrset.h - set of pointers
+ *
+ * Copyright (C) 2014 Tejun Heo, Red Hat Inc.
+ *
+ * A ptrset contains an unordered set of pointers. Pointers can be added,
+ * deleted and iterated. Addition and removal complexities are O(log N)
+ * where N is the total number of elements in the ptrset.
+ */
+
+#ifndef __PTRSET_H
+#define __PTRSET_H
+
+#include <linux/preempt.h>
+#include <linux/rbtree.h>
+
+struct ptrset {
+ struct rb_root root;
+};
+
+struct ptrset_elem {
+ void *ptr;
+ struct rb_node node;
+};
+
+struct ptrset_iter {
+ struct ptrset_elem *pos;
+ struct ptrset_elem *next;
+};
+
+#define PTRSET_INITIALIZER { .root = RB_ROOT, }
+#define DEFINE_PTRSET(set) struct ptrset set = PTRSET_INITIALIZER
+
+static inline void ptrset_init(struct ptrset *set)
+{
+ *set = (struct ptrset)PTRSET_INITIALIZER;
+}
+
+void ptrset_preload(gfp_t gfp);
+int __must_check ptrset_add(void *ptr, struct ptrset *set, gfp_t gfp);
+int ptrset_del(void *ptr, struct ptrset *set);
+
+/**
+ * ptrset_preload_end - end preload section started with ptrset_preload()
+ *
+ * Each ptrset_preload() should be matched with an invocation of this
+ * function. See ptrset_preload() for details.
+ */
+static inline void ptrset_preload_end(void)
+{
+ preempt_enable();
+}
+
+/**
+ * ptrset_for_each - iterate pointers in a ptrset
+ * @ptr_cur: the cursor pointer variable (can be a pointer to any type)
+ * @set: the target ptrset
+ * @iter: pointer to struct ptrset_iter used to store iteration state
+ *
+ * Iterate @ptr_cur over all pointers in @set using @iter as the temp
+ * storage. The order of iteration is undefined and the iterated pointers
+ * may be deleted during iteration.
+ *
+ * The caller is responsible for synchronization. This is not RCU safe.
+ */
+#define ptrset_for_each(ptr_cur, set, iter) \
+ rbtree_postorder_for_each_entry_safe((iter)->pos, (iter)->next, \
+ &(set)->root, node) \
+ if (({ (ptr_cur) = (iter)->pos->ptr; \
+ false; })) \
+ ; \
+ else
+
+#endif /* __PTRSET_H */
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLA
endif

lib-y := ctype.o string.o vsprintf.o cmdline.o \
- rbtree.o radix-tree.o dump_stack.o timerqueue.o\
+ rbtree.o radix-tree.o dump_stack.o timerqueue.o ptrset.o \
idr.o int_sqrt.o extable.o \
sha1.o md5.o irq_regs.o argv_split.o \
proportions.o flex_proportions.o ratelimit.o show_mem.o \
--- /dev/null
+++ b/lib/ptrset.c
@@ -0,0 +1,129 @@
+/*
+ * lib/ptrset.c - set of pointers
+ *
+ * Copyright (C) 2014 Tejun Heo, Red Hat Inc.
+ */
+
+#include <linux/ptrset.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+
+static DEFINE_PER_CPU(struct ptrset_elem *, ptrset_preload_elem);
+
+/**
+ * ptrset_preload - preload for ptrset_add()
+ * @gfp: allocation mask to use for preloading
+ *
+ * Preload per-cpu ptrset_elem buffer for ptrset_add(). Can only be used
+ * from process context and each ptrset_preload() invocation should be
+ * matched with ptrset_preload_end(). Note that preemption is disabled
+ * while preloaded.
+ *
+ * The first ptrset_add() in the preloaded section can be treated as if it
+ * were invoked with @gfp used for preloading. This allows using more
+ * permissive allocation masks for ptrsets protected by spinlocks.
+ */
+void ptrset_preload(gfp_t gfp)
+{
+ /*
+ * Consuming preload buffer from non-process context breaks preload
+ * allocation guarantee. Disallow usage from those contexts.
+ */
+ WARN_ON_ONCE(in_interrupt());
+ might_sleep_if(gfp & __GFP_WAIT);
+
+ /* matched by preempt_enable() in ptrset_preload_end() */
+ preempt_disable();
+
+ if (!__this_cpu_read(ptrset_preload_elem)) {
+ struct ptrset_elem *new;
+
+ /* re-enable preemption across kmalloc(), which may sleep */
+ preempt_enable();
+ new = kmalloc(sizeof(*new), gfp);
+ preempt_disable();
+
+ /*
+ * We may have migrated or raced with another preloader while
+ * preemption was enabled, so this CPU's slot may already be
+ * filled. Install @new only by exchanging, and free whichever
+ * element is surplus (kfree(NULL) is a no-op on alloc failure).
+ */
+ new = __this_cpu_xchg(ptrset_preload_elem, new);
+ kfree(new);
+ }
+}
+
+/*
+ * ptrset_find_slot - walk @set's rbtree looking for @ptr
+ * @ptr: the pointer to look up
+ * @set: the ptrset to search
+ * @slotp: out parameter for the matching/insertion link slot
+ * @parentp: optional out parameter for the would-be parent node
+ *
+ * Elements are ordered by raw pointer value. Returns %true if @ptr is
+ * already in @set, with *@slotp pointing at the link holding its node.
+ * Returns %false otherwise, with *@slotp and *@parentp describing the
+ * insertion point for rb_link_node(). Note *@parentp is only written
+ * on the %false return.
+ */
+static __always_inline bool ptrset_find_slot(void *ptr, struct ptrset *set,
+ struct rb_node ***slotp,
+ struct rb_node **parentp)
+{
+ struct rb_node **slot = &set->root.rb_node, *parent = NULL;
+
+ while (*slot) {
+ struct ptrset_elem *elem =
+ container_of(*slot, struct ptrset_elem, node);
+
+ parent = *slot;
+ if (ptr < elem->ptr) {
+ slot = &(*slot)->rb_left;
+ } else if (ptr > elem->ptr) {
+ slot = &(*slot)->rb_right;
+ } else {
+ *slotp = slot;
+ return true;
+ }
+ }
+
+ *slotp = slot;
+ if (parentp)
+ *parentp = parent;
+ return false;
+}
+
+/**
+ * ptrset_add - add a pointer to a ptrset
+ * @ptr: the pointer to add
+ * @set: the target ptrset
+ * @gfp: the allocation mask to use
+ *
+ * Allocate a ptrset_elem using @gfp, init with @ptr and add to @set.
+ * Returns 0 on success, -ENOMEM if allocation failed and -EEXIST if @ptr
+ * already exists in @set.
+ *
+ * The caller is responsible for synchronization.
+ */
+int ptrset_add(void *ptr, struct ptrset *set, gfp_t gfp)
+{
+ struct ptrset_elem *elem;
+ struct rb_node *parent, **slot;
+
+ /* look up first so a duplicate costs no allocation */
+ if (ptrset_find_slot(ptr, set, &slot, &parent))
+ return -EEXIST;
+
+ elem = kmalloc(sizeof(*elem), gfp);
+ if (!elem && !in_interrupt()) /* see ptrset_preload() */
+ elem = this_cpu_xchg(ptrset_preload_elem, NULL);
+ if (!elem)
+ return -ENOMEM;
+ elem->ptr = ptr;
+
+ /* link at the slot found above and rebalance */
+ rb_link_node(&elem->node, parent, slot);
+ rb_insert_color(&elem->node, &set->root);
+ return 0;
+}
+
+/**
+ * ptrset_del - delete a pointer from a ptrset
+ * @ptr: the pointer to delete
+ * @set: the target ptrset
+ *
+ * Delete @ptr from @set. Returns 0 on success, -ENOENT if @ptr is not in
+ * @set.
+ *
+ * The caller is responsible for synchronization.
+ */
+int ptrset_del(void *ptr, struct ptrset *set)
+{
+ struct rb_node **slot, *node;
+
+ if (!ptrset_find_slot(ptr, set, &slot, NULL))
+ return -ENOENT;
+
+ /* on the found path, *slot is the link holding @ptr's node */
+ node = *slot;
+ rb_erase(node, &set->root);
+ kfree(container_of(node, struct ptrset_elem, node));
+ return 0;
+}

2014-11-18 20:46:28

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

On Fri, 14 Nov 2014 08:12:02 -0500 Tejun Heo <[email protected]> wrote:

> Hello, Andrew.
>
> On Thu, Nov 13, 2014 at 02:40:41PM -0800, Andrew Morton wrote:
> > In that case tjpointer_add() would need to do a kmalloc() for each inode
> > which is added to the bdev/cdev, just as ptrset_add() is doing.
> >
> > That might require a nasty preload thing. But really, for just two
> > known callers it would be better to require the caller to create the
> > storage.
> >
> >
> > struct tjpointer *new_tpj;
> >
> > new_tpj = kmalloc(...);
> > lock();
> > tjpointer_add(&my_tjp_list, new_tjp, my_pointer);
> > unlock();
> >
> > Basically what I'm saying is nuke the rbtree and use lists.
>
> Hah? Then, each removal would be O(N) where N is the number of total
> block devices and there are cases where massive number of block
> devices exist and many are added / removed back-to-back. I don't
> think making those operations O(N^2) is a good idea.
>

bdev_evict_inode() walks all the inodes attached to the bdev and
unlinks them from the bdev. That can be done with
list_for_each_safe(), just as it is (effectively) in current mainline.

IOW, all we need to do is to remove the list_head from struct inode and
create a new, separately allocated { struct list_head l; void *inode }
to point at the inode. IOW, simply convert the intrusive list to a
nonintrusive list.


This is proving a painful way of extracting a changelog :( Perhaps I'm
still not getting it and you should have another go, this time
explaining the reasoning behind the design choices.

2014-11-19 01:37:48

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

On 11/18/2014 07:55 PM, Tejun Heo wrote:
> Hello,
>
> On Tue, Nov 18, 2014 at 05:19:18PM +0800, Lai Jiangshan wrote:
>> Is it too ugly?
>
> What is "it"? The whole thing? percpu preloading? I'm just gonna
> continue assuming that you're talking about preloading. If you think
> it's ugly, please go ahead and explain why you think it is.

Sorry.
"it" = "preload"

>
> It's almost impossible to respond to your "review". It's not clear
> what your subject matter or opinion on it is. Might as well just bang
> on the keyboard randomly. When reviewing (or communicating in
> general), please try to properly form and elaborate your points.
> Other people can't know what's going on in your brain and have to
> speculate what you could have meant.
>
> This implementation of preloading is an evolution of a design pattern
> which, IIRC, first started with the radix tree. The non-failing
> aspect was introduced while the pattern was being applied to idr. I
> think it's one of the better ways to implement preloading.
>
>> What will be the most important result it achieve?
>
> This is the same as other preloading. It allows pulling allocation
> out of critical section so that it can be done with more generous
> allocation mask (ie. GFP_KERNEL instead of GPF_NOWAIT). It's a common
> pattern found in data structures which may allocate memory internally
> such as radix tree or idr.

To me, this does explain why it is ugly. The "preload" trick separates one operation
(the memory-allocation) into 3 steps(functions) and creates a special critical region
which is preempt-disabled, which is non-workable-when-nested, the later drawback also
means preload can't work in softirqs/irqs...

I can't argue for existing code. I accept the prelaod in radix tree and idr.
And they are important data structures. (I mean they achieve important result)
so tricks or some thing applied to them seems some kinds of acceptable.

Even in idr, idr_alloc() includes two operations, id-allocation (includes memory-allocation)
and id-resource-association. These two operations can be separated into 2 functions
without any "preload". (this separation is different from the one in "preload",
this possible separation makes one function only do one thing,
"preload"-approach uses 3 functions together to do one thing or 2 things.)

Thanks.
Lai

2014-11-20 10:43:00

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH vfs 2/2] {block|char}_dev: remove inode->i_devices

On 11/14/2014 12:11 AM, Tejun Heo wrote:
> inode->i_devices is a list_head used to link device inodes to the
> corresponding block_device or cdev. This patch makes block_device and
> cdev usfe ptrset to keep track of the inodes instead of linking
> inode->i_devices allowing removal of the field and thus reduction of
> struct inode by two pointers.
>
> The conversion is straightforward. list_add() is replaced with
> preloaded ptrset_add(), list_del_init() with ptrset_del(), and list
> iteration with ptrset_for_each(). The only part which isn't direct
> one-to-one mapping is the error handling after ptrset_add() failure.
>
> The saved two pointers will be used by cgroup writeback support.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Cc: Alexander Viro <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Cc: Christoph Hellwig <[email protected]>
> ---
> fs/block_dev.c | 39 +++++++++++++++++++++++----------------
> fs/char_dev.c | 25 +++++++++++++++----------
> fs/inode.c | 1 -
> include/linux/cdev.h | 4 ++--
> include/linux/fs.h | 4 ++--
> 5 files changed, 42 insertions(+), 31 deletions(-)
>
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -458,7 +458,7 @@ static void init_once(void *foo)
>
> memset(bdev, 0, sizeof(*bdev));
> mutex_init(&bdev->bd_mutex);
> - INIT_LIST_HEAD(&bdev->bd_inodes);
> + ptrset_init(&bdev->bd_inodes);
> INIT_LIST_HEAD(&bdev->bd_list);
> #ifdef CONFIG_SYSFS
> INIT_LIST_HEAD(&bdev->bd_holder_disks);
> @@ -470,7 +470,7 @@ static void init_once(void *foo)
>
> static inline void __bd_forget(struct inode *inode)
> {
> - list_del_init(&inode->i_devices);
> + ptrset_del(inode, &inode->i_bdev->bd_inodes);
> inode->i_bdev = NULL;
> inode->i_mapping = &inode->i_data;
> }
> @@ -478,14 +478,15 @@ static inline void __bd_forget(struct in
> static void bdev_evict_inode(struct inode *inode)
> {
> struct block_device *bdev = &BDEV_I(inode)->bdev;
> - struct list_head *p;
> + struct ptrset_iter iter;
> + struct inode *bd_inode;
> +
> truncate_inode_pages_final(&inode->i_data);
> invalidate_inode_buffers(inode); /* is it needed here? */
> clear_inode(inode);
> spin_lock(&bdev_lock);
> - while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
> - __bd_forget(list_entry(p, struct inode, i_devices));
> - }
> + ptrset_for_each(bd_inode, &bdev->bd_inodes, &iter)
> + __bd_forget(bd_inode);
> list_del_init(&bdev->bd_list);
> spin_unlock(&bdev_lock);
> }
> @@ -634,20 +635,26 @@ static struct block_device *bd_acquire(s
>
> bdev = bdget(inode->i_rdev);
> if (bdev) {
> + ptrset_preload(GFP_KERNEL);

if I understand correctly the motivation here is that the allocation
of the internal element is done GFP_KERNEL at this call

Then the add() below can be under the spin_lock.

So why don't you just return an element here to caller and give it to
add below. No Preemption-disable, no percpu variable, simple. Like:
struct ptrset_elem *new = ptrset_preload(GFP_KERNEL);

then
if (!new)
just fail here just as you faild below with ptrset_add()
> spin_lock(&bdev_lock);

lock taken
> if (!inode->i_bdev) {
> - /*
> - * We take an additional reference to bd_inode,
> - * and it's released in clear_inode() of inode.
> - * So, we can access it via ->i_mapping always
> - * without igrab().
> - */
> - ihold(bdev->bd_inode);
> - inode->i_bdev = bdev;
> - inode->i_mapping = bdev->bd_inode->i_mapping;
> - list_add(&inode->i_devices, &bdev->bd_inodes);
> + if (!ptrset_add(inode, &bdev->bd_inodes, GFP_NOWAIT)) {

ptrset_add(inode, &bdev->bd_inodes, new);

Here ptrset_add cannot fail and can just be void return.
(If element exist then "new" is freed inside here. After add() "new" is owned
by the pset)

> + /*
> + * We take an additional reference to bd_inode,
> + * and it's released in clear_inode() of inode.
> + * So, we can access it via ->i_mapping always
> + * without igrab().
> + */
> + ihold(bdev->bd_inode);
> + inode->i_bdev = bdev;
> + inode->i_mapping = bdev->bd_inode->i_mapping;
> + } else {

This else is of the if(!new) above if need the spinlock then fine lock for this too.

> + bdput(bdev);
> + bdev = NULL;
> + }
> }
> spin_unlock(&bdev_lock);
> + ptrset_preload_end();

This one not needed anymore

> }
> return bdev;
> }
> --- a/fs/char_dev.c
> +++ b/fs/char_dev.c
> @@ -383,16 +383,20 @@ static int chrdev_open(struct inode *ino
> if (!kobj)
> return -ENXIO;
> new = container_of(kobj, struct cdev, kobj);
> + ptrset_preload(GFP_KERNEL);

Same exact thing here

> spin_lock(&cdev_lock);
> /* Check i_cdev again in case somebody beat us to it while
> we dropped the lock. */
> p = inode->i_cdev;
> if (!p) {
> - inode->i_cdev = p = new;
> - list_add(&inode->i_devices, &p->list);
> - new = NULL;
> + ret = ptrset_add(inode, &new->inodes, GFP_NOWAIT);
> + if (!ret) {
> + inode->i_cdev = p = new;
> + new = NULL;
> + }
> } else if (!cdev_get(p))
> ret = -ENXIO;
> + ptrset_preload_end();
> } else if (!cdev_get(p))
> ret = -ENXIO;
> spin_unlock(&cdev_lock);
<>

Am I totally missing something? It looks like you want to make sure you
allocate-with-wait an element before hand, if needed, usually you do, before
you take spin-locks. Is there some other reasons that I do not see?

Thanks
Boaz

2014-11-20 11:50:59

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH vfs 2/2] {block|char}_dev: remove inode->i_devices

Hello, Boaz.

On Thu, Nov 20, 2014 at 12:42:53PM +0200, Boaz Harrosh wrote:
> if I understand correctly the motivation here is that the allocation
> of the internal element is done GFP_KERNEL at this call
>
> Then the add() below can be under the spin_lock.
>
> So why don't you just return an element here to caller and give it to
> add below. No Preemption-disable, no percpu variable, simple. Like:

Hmmm... mostly because preloading is more convenient and but also
because it provides better separation from internal implementation
details. e.g. This may be implemented using a different data
structure (e.g. bonsai tree) which may require differing number of new
elements even on success. With the scheme you're describing, the
operation would be constantly allocating and freeing memory areas
(which may be multiple) unnecessarily.

One thing which is debatable is how to handle preloading errors. We
can have the preload fail and then assume that the later insertion
won't fail with -ENOMEM (often through BUG/WARN_ON()); however, it
often, but not always, is that those insertion operations may fail
with different error codes too and requires error handling anyway, so
overall it seems better to defer the allocation error to the actual
insertion point. It also makes conceptual sense. The preloading
simply upgrades the allocation mask the insertion operation uses.

Thanks.

--
tejun

2014-11-20 12:36:58

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH vfs 2/2] {block|char}_dev: remove inode->i_devices

On 11/20/2014 01:50 PM, Tejun Heo wrote:
> Hello, Boaz.
>
> On Thu, Nov 20, 2014 at 12:42:53PM +0200, Boaz Harrosh wrote:
>> if I understand correctly the motivation here is that the allocation
>> of the internal element is done GFP_KERNEL at this call
>>
>> Then the add() below can be under the spin_lock.
>>
>> So why don't you just return an element here to caller and give it to
>> add below. No Preemption-disable, no percpu variable, simple. Like:
>
> Hmmm... mostly because preloading is more convenient and but also
> because it provides better separation from internal implementation
> details. e.g. This may be implemented using a different data
> structure (e.g. bonsai tree)

Two things:
1. This can be easily hidden by returning a none defined type
which internals are only known to the implementation so
even if you change the implementation users need not change.
Like just a (void *) but it is better to be type-full
like:
struct pset_new;
struct pset_new *pset_preload()

And the internals of struct pset_new is only known to implementation
2. Obfuscation: Currently this is the proposed implementation if judging
by the previous imp it is good for 15 years.
Else since when are we afraid to change two users?

> which may require differing number of new
> elements even on success. With the scheme you're describing, the
> operation would be constantly allocating and freeing memory areas
> (which may be multiple) unnecessarily.

Actually with my proposed change to "the code you submitted here"
there are *less* unnecessary allocations. In both our imp we have a
waste when element already exist in the tree, and your imp already
wastes an allocation in every pset_preload()

And again you are talking about a future undefined "what if", let
us look at the very sound imp you proposed here with rbtree and
do the best we can with that one.

>
> One thing which is debatable is how to handle preloading errors. We
> can have the preload fail and then assume that the later insertion
> won't fail with -ENOMEM (often through BUG/WARN_ON()); however, it
> often, but not always, is that those insertion operations may fail
> with different error codes too and requires error handling anyway,

Again Theoretical. With your current code the only failing I see
from add() is allocation, so with my imp it will never fail. One
thing good with embedded list_heads is the void returning add.
And so with my proposition: void returning add.

When some new imp will be needed we can cross the bridge then.

For now you have convinced me that an rbtree is good, and I want to
solve the preemption-disable, none interrupt ugliness, per-cpu vars,
as well as the double alloc in the normal lots-of-free-memory case.

> so
> overall it seems better to defer the allocation error to the actual
> insertion point.

That one I did not understand.

> It also makes conceptual sense. The preloading
> simply upgrades the allocation mask the insertion operation uses.
>

How is "upgrades", better then "always have the best mask"

> Thanks.
>

Thanks
Boaz

2014-11-20 13:11:24

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH vfs 2/2] {block|char}_dev: remove inode->i_devices

Hello, Boaz.

On Thu, Nov 20, 2014 at 02:36:53PM +0200, Boaz Harrosh wrote:
> Actually with my proposed change to "the code you submitted here"
> there are *less* unnecessary allocations. In both our imp we have a
> waste when element already exist in the tree, and your imp already
> wastes an allocation in every pset_preload()
>
> And again you are talking about a future undefined "what if", let
> us look at the very sound imp you proposed here with rbtree and
> do the best we can with that one.

Preloading is an established pattern and a pretty effective one at
that. It allows its users to not worry about carrying the state
explicitly while allowing the preloaded allocation to be cached across
multiple insertions.

Whether ptrset needs it or not is debatable. If it didn't need to be
shared between block and chardevs, I prolly would have just open coded
it with linked list. But being a generic library feature which isn't
too different from other memory allocating indexing data structures, I
think it'd be better to follow the established pattern rather than
creating another one-off behavior. If we demote this to something
which is shared only between block and char devs, we can go simpler
and less versatile. Where should it go tho?

> Again Theoretical. With your current code the only failing I see
> from add() is allocation, so with my imp it will never fail. One
> thing good with embedded list_heads is the void returning add.
> And so with my proposition: void returning add.

Sure, if you look at just these two instances. We can just require
preloading and drop @gfp from the insertion function too. -EEXIST can
be avoided too, so let's drop the return value entirely.

Again, the problem is that it's implemented as a generic lib feature,
so it needs to consider how different use cases can be served and we
have an established pattern which proved to be effective over time for
these types of data structures and that's the pattern being followed
here.

> When some new imp will be needed we can cross the bridge then.
>
> For now you have convinced me that an rbtree is good, and I want to
> solve the preemption-disable, none interrupt ugliness, per-cpu vars,
> as well as the double alloc in the normal lots-of-free-memory case.

It's a reserve which is kept around, one per CPU at most, which also
allows us to avoid spurious frees on insertion attempts which don't
consume the reserve. If your point is that we don't need ptrset as a
generic lib thing, this can definitely be simpler but otherwise I'm
not sure what you're arguing. We have an established pattern to make
preallocations easier for generic lib functions. Why wouldn't we use
that and make things simpler for its users?

> > so
> > overall it seems better to defer the allocation error to the actual
> > insertion point.
>
> That one I did not understand.

So, w/o preloading, you would do,

error = insert();
if (error)
handle error;

W/ preloading, one way to do it is,

if (preload())
handle -ENOMEM;
lock;
error = insert();
if (error)
handle error which can't be -ENOMEM;
unlock;
preload_end();

Another way is

preload(); // can't fail
lock;
error = insert();
if (error)
handle error;'
unlock;
preload_end();

Both ways have pros and cons. The latter seems to lead to simpler
code in general. Not always, but the overall.

> > It also makes conceptual sense. The preloading
> > simply upgrades the allocation mask the insertion operation uses.
>
> How is "upgrades", better then "always have the best mask"

Hah? What are you talking about? So,

preload(GFP_KERNEL);
lock;
insert(GFP_NOWAIT);
unlock;
preload_end();

can be considered as insert() allocating memory using GFP_KERNEL.
That's what preallocation does. My point was that deferring
allocation error to insert() time allows considering the above code
like the following.

lock;
insert(GFP_KERNEL);
unlock;

And that's why the pattern usually leads to simpler code - it doesn't
create a new failure point.

Thanks.

--
tejun

2014-11-20 13:39:58

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH vfs 2/2] {block|char}_dev: remove inode->i_devices

On Thu, Nov 20, 2014 at 08:11:18AM -0500, Tejun Heo wrote:
...
> creating another one-off behavior. If we demote this to something
> which is shared only between block and char devs, we can go simpler
> and less versatile. Where should it go tho?

So, one option is implementing something minimal in fs/devptrs.h or
somesuch and include it from block/char_dev.c. If it isn't exposed as
a generic lib function, this can be a lot simpler. I'll give it a
shot later today.

Thanks.

--
tejun

2014-11-20 14:14:34

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH vfs 2/2] {block|char}_dev: remove inode->i_devices

On 11/20/2014 03:11 PM, Tejun Heo wrote:
> Hello, Boaz.
>
<>
> W/ preloading, one way to do it is,
>
> if (preload())
> handle -ENOMEM;
> lock;
> error = insert();
> if (error)
> handle error which can't be -ENOMEM;
> unlock;
> preload_end();
>

I like this one, cause of the place I come from. Where
in a cluster you want the local fails as early as possible
before you start to commit remotely, and need to undo on
errors.

And I can see the real flow of things

> Another way is
>
> preload(); // can't fail
> lock;
> error = insert();
> if (error)
> handle error;'
> unlock;
> preload_end();
>
> Both ways have pros and cons. The latter seems to lead to simpler
> code in general. Not always, but the overall.
>

I still like the over all simplicity of the above pattern to
this behind the seen complexity hidden away under the carpet.

But I guess that is just me. That is your call sir.

I do see your point though.

<>
>
> And that's why the pattern usually leads to simpler code - it doesn't
> create a new failure point.
>

Again a matter of taste. I like the extra ENOMEM failure point before
I started to commit to any state changes, lock grabbing and unrolling
in case of errors.

But I see your points as well. For what it is worth I have reviewed
your code and did not find any faults in it. It looks like sound
code.

Thanks
Boaz

2014-11-20 14:19:47

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH vfs 2/2] {block|char}_dev: remove inode->i_devices

On Thu, Nov 20, 2014 at 04:14:29PM +0200, Boaz Harrosh wrote:
> On 11/20/2014 03:11 PM, Tejun Heo wrote:
> > Hello, Boaz.
> >
> <>
> > W/ preloading, one way to do it is,
> >
> > if (preload())
> > handle -ENOMEM;
> > lock;
> > error = insert();
> > if (error)
> > handle error which can't be -ENOMEM;
> > unlock;
> > preload_end();
> >
>
> I like this one, cause of the place I come from. Where
> in a cluster you want the local fails as early as possible
> before you start to commit remotely, and need to undo on
> errors.
>
> And I can see the real flow of things
>
> > Another way is
> >
> > preload(); // can't fail
> > lock;
> > error = insert();
> > if (error)
> > handle error;'
> > unlock;
> > preload_end();
> >
> > Both ways have pros and cons. The latter seems to lead to simpler
> > code in general. Not always, but the overall.
> >
>
> I still like the over all simplicity of the above pattern to
> this behind the seen complexity hidden away under the carpet.

Maybe the right thing to do is allowing both. It really depends on
the use cases. For a lot of cases, the error handling and unrolling
are necessary anyway so the deferred preload failure doesn't add any
complexity but there are cases where the insertion can be guaranteed
to succeed. The only change necessary is making preload return
-ENOMEM which can be ignored by the caller anyway. The caller can
either ignore and handle the deferred failure later or handle it and
know that later insertion won't fail with -ENOMEM.

Anyways, let's see how minimal linked list implementation works out.

Thanks.

--
tejun

2014-11-25 16:37:14

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

On Thu 13-11-14 17:09:27, Tejun Heo wrote:
> Implement set of pointers. Pointers can be added, deleted and
> iterated. It's currently implemented as a thin rbtree wrapper making
> addition and removal O(log N). A drawback is that iteration isn't RCU
> safe, which is okay for now. This will be used to remove
> inode->i_devices.
Tejun, I've noticed this somewhat late but I guess you haven't noticed my
patch series starting with a patch:
blockdev: Don't use i_devices inode field

We can get rid of i_devices without tricks like this... I just need to
improve how char devices are handled.

Honza
>
> Signed-off-by: Tejun Heo <[email protected]>
> ---
> include/linux/ptrset.h | 74 +++++++++++++++++++++++++++
> lib/Makefile | 2
> lib/ptrset.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 206 insertions(+), 1 deletion(-)
>
> --- /dev/null
> +++ b/include/linux/ptrset.h
> @@ -0,0 +1,74 @@
> +/*
> + * include/linux/ptrset.h - set of pointers
> + *
> + * Copyright (C) 2014 Tejun Heo, Red Hat Inc.
> + *
> + * A ptrset contains an unordered set of pointers. Pointers can be added,
> + * deleted and iterated. Addition and removal complexities are O(log N)
> + * where N is the total number of elements in the ptrset.
> + */
> +
> +#ifndef __PTRSET_H
> +#define __PTRSET_H
> +
> +#include <linux/preempt.h>
> +#include <linux/rbtree.h>
> +
> +struct ptrset {
> + struct rb_root root;
> +};
> +
> +struct ptrset_elem {
> + void *ptr;
> + struct rb_node node;
> +};
> +
> +struct ptrset_iter {
> + struct ptrset_elem *pos;
> + struct ptrset_elem *next;
> +};
> +
> +#define PTRSET_INITIALIZER { .root = RB_ROOT, }
> +#define DEFINE_PTRSET(set) struct ptrset set = PTRSET_INITIALIZER
> +
> +static inline void ptrset_init(struct ptrset *set)
> +{
> + *set = (struct ptrset)PTRSET_INITIALIZER;
> +}
> +
> +void ptrset_preload(gfp_t gfp);
> +int __must_check ptrset_add(void *ptr, struct ptrset *set, gfp_t gfp);
> +int ptrset_del(void *ptr, struct ptrset *set);
> +
> +/**
> + * ptrset_preload_end - end preload section started with ptrset_preload()
> + *
> + * Each ptrset_preload() should be matched with an invocation of this
> + * function. See ptrset_preload() for details.
> + */
> +static inline void ptrset_preload_end(void)
> +{
> + preempt_enable();
> +}
> +
> +/**
> + * ptrset_for_each - iterate pointers in a ptrset
> + * @ptr_cur: the cursor pointer variable (can be a pointer to any type)
> + * @set: the target ptrset
> + * @iter: pointer to struct ptrset_iter used to store iteration state
> + *
> + * Iterate @ptr_cur over all pointers in @set using @iter as the temp
> + * storage. The order of iteration is undefined and the iterated pointers
> + * may be deleted during iteration.
> + *
> + * The caller is responsible for synchronization. This is not RCU safe.
> + */
> +#define ptrset_for_each(ptr_cur, set, iter) \
> + rbtree_postorder_for_each_entry_safe((iter)->pos, (iter)->next, \
> + &(set)->root, node) \
> + if (({ (ptr_cur) = (iter)->pos->ptr; \
> + false; })) \
> + ; \
> + else
> +
> +#endif /* __PTRSET_H */
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLA
> endif
>
> lib-y := ctype.o string.o vsprintf.o cmdline.o \
> - rbtree.o radix-tree.o dump_stack.o timerqueue.o\
> + rbtree.o radix-tree.o dump_stack.o timerqueue.o ptrset.o \
> idr.o int_sqrt.o extable.o \
> sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
> proportions.o flex_proportions.o ratelimit.o show_mem.o \
> --- /dev/null
> +++ b/lib/ptrset.c
> @@ -0,0 +1,131 @@
> +/*
> + * lib/ptrset.c - set of pointers
> + *
> + * Copyright (C) 2014 Tejun Heo, Red Hat Inc.
> + */
> +
> +#include <linux/ptrset.h>
> +#include <linux/slab.h>
> +#include <linux/percpu.h>
> +
> +static DEFINE_PER_CPU(struct ptrset_elem *, ptrset_preload_elem);
> +
> +/**
> + * ptrset_preload - preload for ptrset_add()
> + * @gfp: allocation mask to use for preloading
> + *
> + * Preload per-cpu ptrset_elem buffer for ptrset_add(). Can only be used
> + * from process context and each ptrset_preload() invocation should be
> + * matched with ptrset_preload_end(). Note that preemption is disabled
> + * while preloaded.
> + *
> + * The first ptrset_add() in the preloaded section can be treated as if it
> + * were invoked with @gfp used for preloading. This allows using more
> + * permissive allocation masks for ptrsets protected by spinlocks.
> + */
> +void ptrset_preload(gfp_t gfp)
> +{
> + /*
> + * Consuming preload buffer from non-process context breaks preload
> + * allocation guarantee. Disallow usage from those contexts.
> + */
> + WARN_ON_ONCE(in_interrupt());
> + might_sleep_if(gfp & __GFP_WAIT);
> +
> + preempt_disable();
> +
> + if (!__this_cpu_read(ptrset_preload_elem)) {
> + struct ptrset_elem *new;
> +
> + preempt_enable();
> + new = kmalloc(sizeof(*new), gfp);
> + preempt_disable();
> +
> + new = __this_cpu_xchg(ptrset_preload_elem, new);
> + kfree(new);
> + }
> +}
> +
> +static __always_inline bool ptrset_find_slot(void *ptr, struct ptrset *set,
> + struct rb_node ***slotp,
> + struct rb_node **parentp)
> +{
> + struct rb_node **slot = &set->root.rb_node, *parent = NULL;
> +
> + while (*slot) {
> + struct ptrset_elem *elem =
> + container_of(*slot, struct ptrset_elem, node);
> +
> + parent = *slot;
> + if (ptr < elem->ptr) {
> + slot = &(*slot)->rb_left;
> + } else if (ptr > elem->ptr) {
> + slot = &(*slot)->rb_right;
> + } else {
> + *slotp = slot;
> + return true;
> + }
> + }
> +
> + *slotp = slot;
> + if (parentp)
> + *parentp = parent;
> + return false;
> +}
> +
> +/**
> + * ptrset_add - add a pointer to a ptrset
> + * @ptr: the pointer to add
> + * @set: the target ptrset
> + * @gfp: the allocation mask to use
> + *
> + * Allocate a ptrset_elem using @gfp, init with @ptr and add to @set.
> + * Returns 0 on success, -ENOMEM if allocation failed and -EEXIST if @ptr
> + * already exists in @set.
> + *
> + * The caller is responsible for synchronization.
> + */
> +int ptrset_add(void *ptr, struct ptrset *set, gfp_t gfp)
> +{
> + struct ptrset_elem *elem;
> + struct rb_node *parent, **slot;
> +
> + /*
> + * On allocation failure, fall back to the per-cpu preloaded
> + * element. Interrupt context must not consume it as that would
> + * break the guarantee given to a preloading task.
> + */
> + elem = kmalloc(sizeof(*elem), gfp);
> + if (!elem && !in_interrupt()) /* see ptrset_preload() */
> + elem = this_cpu_xchg(ptrset_preload_elem, NULL);
> + if (!elem)
> + return -ENOMEM;
> + elem->ptr = ptr;
> +
> + /* find the insertion point; bail out if @ptr is already present */
> + if (ptrset_find_slot(elem->ptr, set, &slot, &parent)) {
> + kfree(elem);
> + return -EEXIST;
> + }
> +
> + rb_link_node(&elem->node, parent, slot);
> + rb_insert_color(&elem->node, &set->root);
> + return 0;
> +}
> +
> +/**
> + * ptrset_del - delete a pointer from a ptrset
> + * @ptr: the pointer to delete
> + * @set: the target ptrset
> + *
> + * Delete @ptr from @set. Returns 0 on success, -ENOENT if @ptr is not in
> + * @set.
> + *
> + * The caller is responsible for synchronization.
> + */
> +int ptrset_del(void *ptr, struct ptrset *set)
> +{
> + struct rb_node **slot, *node;
> +
> + /* parent is only needed for insertion, hence the NULL parentp */
> + if (!ptrset_find_slot(ptr, set, &slot, NULL))
> + return -ENOENT;
> +
> + /* unlink the matching node from the tree, then free its element */
> + node = *slot;
> + rb_erase(node, &set->root);
> + kfree(container_of(node, struct ptrset_elem, node));
> + return 0;
> +}
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Jan Kara <[email protected]>
SUSE Labs, CR

2014-12-02 18:15:57

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH vfs 1/2] lib: implement ptrset

Hello, Jan.

On Tue, Nov 25, 2014 at 05:37:07PM +0100, Jan Kara wrote:
> On Thu 13-11-14 17:09:27, Tejun Heo wrote:
> > Implement set of pointers. Pointers can be added, deleted and
> > iterated. It's currently implemented as a thin rbtree wrapper making
> > addition and removal O(log N). A drawback is that iteration isn't RCU
> > safe, which is okay for now. This will be used to remove
> > inode->i_devices.
> Tejun, I've noticed this somewhat late but I guess you haven't noticed my
> patch series starting with a patch:
> blockdev: Don't use i_devices inode field
>
> We can get rid of i_devices without tricks like this... I just need to
> improve how char devices are handled.

Ooh, that sounds perfect to me. Please ignore these patches.

Thanks!

--
tejun