2010-12-23 05:44:25

by Huang, Ying

Subject: [RFC -v9 0/4] Lock-less list

Add a lock-less NULL-terminated linked list implementation. And use
that in irq_work and replace net/rds/xlist.h.

v9:

- Split out the lock-less allocator; it will be reposted with its user
in another patchset.
- Use lock-less list in irq_work and replace net/rds/xlist.h

v8:

- Rebased on mmotm 2010-12-02

v7:

- Revise the ARCH_HAVE_NMI_SAFE_CMPXCHG definition for some architectures
according to architecture maintainers' comments.
- Remove the spin_trylock_irqsave based fallback for the lockless memory
allocator, because it does not work for !CONFIG_SMP and is not likely
to be used.
- Make the lockless memory allocator and list not depend on
ARCH_HAVE_NMI_SAFE_CMPXCHG. Instead, require the user to depend on it
when needed, and add BUG_ON(in_nmi()) in the necessary places to
prevent silent races.

v6:

- Revise the ARCH_HAVE_NMI_SAFE_CMPXCHG definition for some architectures
according to architecture maintainers' comments.

v5:

- Add ARCH_HAVE_NMI_SAFE_CMPXCHG
- Add a spin_trylock_irqsave based fallback in the lockless memory
allocator if ARCH_HAVE_NMI_SAFE_CMPXCHG=n
- Make the lockless list depend on ARCH_HAVE_NMI_SAFE_CMPXCHG

v4:

- Split from APEI patchset
- Update patch description and comments according to ML comments

v3:

- Rework lockless memory allocator and list according to ML comments

[RFC -v9 1/4] Add Kconfig option ARCH_HAVE_NMI_SAFE_CMPXCHG
[RFC -v9 2/4] lib, Add lock-less NULL terminated single list
[RFC -v9 3/4] irq_work, Use llist in irq_work
[RFC -v9 4/4] net, rds, Replace xlist in net/rds/xlist.h with llist


2010-12-23 05:44:28

by Huang, Ying

Subject: [RFC -v9 3/4] irq_work, Use llist in irq_work

Use llist in irq_work instead of the private lock-less linked list
implementation in irq_work, to avoid code duplication.
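
For reference, here is a minimal usage sketch of the irq_work API as it
looks after this change. The callback and caller names are made up for
illustration and are not part of this patch:

#include <linux/kernel.h>
#include <linux/irq_work.h>

/* runs later from irq_work_run(), in hard interrupt context */
static void hypothetical_callback(struct irq_work *work)
{
        pr_info("irq_work callback ran\n");
}

static struct irq_work hypothetical_work;

static void hypothetical_setup(void)
{
        init_irq_work(&hypothetical_work, hypothetical_callback);
}

/* may be called from NMI or other atomic context */
static void hypothetical_event(void)
{
        /* returns false if the work is already pending */
        irq_work_queue(&hypothetical_work);
}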

Signed-off-by: Huang Ying <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
include/linux/irq_work.h | 15 ++++---
init/Kconfig | 1
kernel/irq_work.c | 90 ++++++++++++++++++-----------------------------
3 files changed, 46 insertions(+), 60 deletions(-)

--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -1,20 +1,23 @@
#ifndef _LINUX_IRQ_WORK_H
#define _LINUX_IRQ_WORK_H

+#include <linux/llist.h>
+
struct irq_work {
- struct irq_work *next;
+ unsigned long flags;
+ struct llist_node llnode;
void (*func)(struct irq_work *);
};

static inline
-void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
{
- entry->next = NULL;
- entry->func = func;
+ work->flags = 0;
+ work->func = func;
}

-bool irq_work_queue(struct irq_work *entry);
+bool irq_work_queue(struct irq_work *work);
void irq_work_run(void);
-void irq_work_sync(struct irq_work *entry);
+void irq_work_sync(struct irq_work *work);

#endif /* _LINUX_IRQ_WORK_H */
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -27,6 +27,7 @@ config HAVE_IRQ_WORK
config IRQ_WORK
bool
depends on HAVE_IRQ_WORK
+ select LLIST

menu "General setup"

--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -17,49 +17,34 @@
* claimed NULL, 3 -> {pending} : claimed to be enqueued
* pending next, 3 -> {busy} : queued, pending callback
* busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- *
- * We use the lower two bits of the next pointer to keep PENDING and BUSY
- * flags.
*/

#define IRQ_WORK_PENDING 1UL
#define IRQ_WORK_BUSY 2UL
#define IRQ_WORK_FLAGS 3UL

-static inline bool irq_work_is_set(struct irq_work *entry, int flags)
-{
- return (unsigned long)entry->next & flags;
-}
+#define LIST_NONEMPTY_BIT 0

-static inline struct irq_work *irq_work_next(struct irq_work *entry)
-{
- unsigned long next = (unsigned long)entry->next;
- next &= ~IRQ_WORK_FLAGS;
- return (struct irq_work *)next;
-}
+struct irq_work_list {
+ unsigned long flags;
+ struct llist_head llist;
+};

-static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
-{
- unsigned long next = (unsigned long)entry;
- next |= flags;
- return (struct irq_work *)next;
-}
-
-static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+static DEFINE_PER_CPU(struct irq_work_list, irq_work_lists);

/*
* Claim the entry so that no one else will poke at it.
*/
-static bool irq_work_claim(struct irq_work *entry)
+static bool irq_work_claim(struct irq_work *work)
{
- struct irq_work *next, *nflags;
+ unsigned long flags, nflags;

do {
- next = entry->next;
- if ((unsigned long)next & IRQ_WORK_PENDING)
+ flags = work->flags;
+ if (flags & IRQ_WORK_PENDING)
return false;
- nflags = next_flags(next, IRQ_WORK_FLAGS);
- } while (cmpxchg(&entry->next, next, nflags) != next);
+ nflags = flags | IRQ_WORK_FLAGS;
+ } while (cmpxchg(&work->flags, flags, nflags) != flags);

return true;
}
@@ -75,20 +60,16 @@ void __weak arch_irq_work_raise(void)
/*
* Queue the entry and raise the IPI if needed.
*/
-static void __irq_work_queue(struct irq_work *entry)
+static void __irq_work_queue(struct irq_work *work)
{
- struct irq_work **head, *next;
+ struct irq_work_list *irq_work_list;

- head = &get_cpu_var(irq_work_list);
+ irq_work_list = &get_cpu_var(irq_work_lists);

- do {
- next = *head;
- /* Can assign non-atomic because we keep the flags set. */
- entry->next = next_flags(next, IRQ_WORK_FLAGS);
- } while (cmpxchg(head, next, entry) != next);
+ llist_add(&work->llnode, &irq_work_list->llist);

/* The list was empty, raise self-interrupt to start processing. */
- if (!irq_work_next(entry))
+ if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
arch_irq_work_raise();

put_cpu_var(irq_work_list);
@@ -100,16 +81,16 @@ static void __irq_work_queue(struct irq_
*
* Can be re-enqueued while the callback is still in progress.
*/
-bool irq_work_queue(struct irq_work *entry)
+bool irq_work_queue(struct irq_work *work)
{
- if (!irq_work_claim(entry)) {
+ if (!irq_work_claim(work)) {
/*
* Already enqueued, can't do!
*/
return false;
}

- __irq_work_queue(entry);
+ __irq_work_queue(work);
return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +101,35 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
*/
void irq_work_run(void)
{
- struct irq_work *list, **head;
+ struct irq_work_list *irq_work_list;
+ struct llist_node *llnode;
+ struct irq_work *work;

- head = &__get_cpu_var(irq_work_list);
- if (*head == NULL)
+ irq_work_list = &__get_cpu_var(irq_work_lists);
+ if (llist_empty(&irq_work_list->llist))
return;

BUG_ON(!in_irq());
BUG_ON(!irqs_disabled());

- list = xchg(head, NULL);
- while (list != NULL) {
- struct irq_work *entry = list;
+ clear_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags);
+ llnode = llist_del_all(&irq_work_list->llist);
+ while (llnode != NULL) {
+ work = llist_entry(llnode, struct irq_work, llnode);

- list = irq_work_next(list);
+ llnode = llnode->next;

/*
- * Clear the PENDING bit, after this point the @entry
+ * Clear the PENDING bit, after this point the @work
* can be re-used.
*/
- entry->next = next_flags(NULL, IRQ_WORK_BUSY);
- entry->func(entry);
+ work->flags = IRQ_WORK_BUSY;
+ work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
- (void)cmpxchg(&entry->next,
- next_flags(NULL, IRQ_WORK_BUSY),
- NULL);
+ (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
}
}
EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +138,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
* Synchronize against the irq_work @entry, ensures the entry is not
* currently in use.
*/
-void irq_work_sync(struct irq_work *entry)
+void irq_work_sync(struct irq_work *work)
{
WARN_ON_ONCE(irqs_disabled());

- while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+ while (work->flags & IRQ_WORK_BUSY)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);

2010-12-23 05:44:27

by Huang, Ying

Subject: [RFC -v9 4/4] net, rds, Replace xlist in net/rds/xlist.h with llist

The functionality of xlist and llist is almost the same. This patch
replaces xlist with llist to avoid code duplication.

Known issue: I don't know how to test this; is special hardware needed?

Signed-off-by: Huang Ying <[email protected]>
Cc: Chris Mason <[email protected]>
---
net/rds/Kconfig | 1
net/rds/ib_rdma.c | 110 ++++++++++++++++++++++++------------------------------
net/rds/xlist.h | 80 ---------------------------------------
3 files changed, 50 insertions(+), 141 deletions(-)
delete mode 100644 net/rds/xlist.h

--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -9,6 +9,7 @@ config RDS

config RDS_RDMA
tristate "RDS over Infiniband and iWARP"
+ select LLIST
depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
---help---
Allow RDS to use Infiniband and iWARP as a transport.
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -33,10 +33,10 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rculist.h>
+#include <linux/llist.h>

#include "rds.h"
#include "ib.h"
-#include "xlist.h"

static struct workqueue_struct *rds_ib_fmr_wq;

@@ -51,7 +51,7 @@ struct rds_ib_mr {
struct rds_ib_mr_pool *pool;
struct ib_fmr *fmr;

- struct xlist_head xlist;
+ struct llist_node llnode;

/* unmap_list is for freeing */
struct list_head unmap_list;
@@ -73,9 +73,9 @@ struct rds_ib_mr_pool {
atomic_t item_count; /* total # of MRs */
atomic_t dirty_count; /* # dirty of MRs */

- struct xlist_head drop_list; /* MRs that have reached their max_maps limit */
- struct xlist_head free_list; /* unused MRs */
- struct xlist_head clean_list; /* global unused & unamapped MRs */
+ struct llist_head drop_list; /* MRs that have reached their max_maps limit */
+ struct llist_head free_list; /* unused MRs */
+ struct llist_head clean_list; /* global unused & unamapped MRs */
wait_queue_head_t flush_wait;

atomic_t free_pinned; /* memory pinned by free MRs */
@@ -222,9 +222,9 @@ struct rds_ib_mr_pool *rds_ib_create_mr_
if (!pool)
return ERR_PTR(-ENOMEM);

- INIT_XLIST_HEAD(&pool->free_list);
- INIT_XLIST_HEAD(&pool->drop_list);
- INIT_XLIST_HEAD(&pool->clean_list);
+ init_llist_head(&pool->free_list);
+ init_llist_head(&pool->drop_list);
+ init_llist_head(&pool->clean_list);
mutex_init(&pool->flush_lock);
init_waitqueue_head(&pool->flush_wait);
INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
@@ -262,26 +262,18 @@ void rds_ib_destroy_mr_pool(struct rds_i
kfree(pool);
}

-static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
- struct rds_ib_mr **ibmr_ret)
-{
- struct xlist_head *ibmr_xl;
- ibmr_xl = xlist_del_head_fast(xl);
- *ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
-}
-
static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
{
struct rds_ib_mr *ibmr = NULL;
- struct xlist_head *ret;
+ struct llist_node *ret;
unsigned long *flag;

preempt_disable();
flag = &__get_cpu_var(clean_list_grace);
set_bit(CLEAN_LIST_BUSY_BIT, flag);
- ret = xlist_del_head(&pool->clean_list);
+ ret = llist_del_first(&pool->clean_list);
if (ret)
- ibmr = list_entry(ret, struct rds_ib_mr, xlist);
+ ibmr = llist_entry(ret, struct rds_ib_mr, llnode);

clear_bit(CLEAN_LIST_BUSY_BIT, flag);
preempt_enable();
@@ -531,46 +523,42 @@ static inline unsigned int rds_ib_flush_
}

/*
- * given an xlist of mrs, put them all into the list_head for more processing
+ * given an llist of mrs, put them all into the list_head for more processing
*/
-static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
+static void llist_append_to_list(struct llist_head *llist, struct list_head *list)
{
struct rds_ib_mr *ibmr;
- struct xlist_head splice;
- struct xlist_head *cur;
- struct xlist_head *next;
-
- splice.next = NULL;
- xlist_splice(xlist, &splice);
- cur = splice.next;
- while (cur) {
- next = cur->next;
- ibmr = list_entry(cur, struct rds_ib_mr, xlist);
+ struct llist_node *node;
+ struct llist_node *next;
+
+ node = llist_del_all(llist);
+ while (node) {
+ next = node->next;
+ ibmr = llist_entry(node, struct rds_ib_mr, llnode);
list_add_tail(&ibmr->unmap_list, list);
- cur = next;
+ node = next;
}
}

/*
- * this takes a list head of mrs and turns it into an xlist of clusters.
- * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
- * reuse.
+ * this takes a list head of mrs and turns it into linked llist nodes.
*/
-static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
- struct list_head *list, struct xlist_head *xlist,
- struct xlist_head **tail_ret)
+static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
+ struct list_head *list,
+ struct llist_node **nodes_head,
+ struct llist_node **nodes_tail)
{
struct rds_ib_mr *ibmr;
- struct xlist_head *cur_mr = xlist;
- struct xlist_head *tail_mr = NULL;
+ struct llist_node *cur = NULL;
+ struct llist_node **next = nodes_head;

list_for_each_entry(ibmr, list, unmap_list) {
- tail_mr = &ibmr->xlist;
- tail_mr->next = NULL;
- cur_mr->next = tail_mr;
- cur_mr = tail_mr;
+ cur = &ibmr->llnode;
+ *next = cur;
+ next = &cur->next;
}
- *tail_ret = tail_mr;
+ *next = NULL;
+ *nodes_tail = cur;
}

/*
@@ -583,8 +571,8 @@ static int rds_ib_flush_mr_pool(struct r
int free_all, struct rds_ib_mr **ibmr_ret)
{
struct rds_ib_mr *ibmr, *next;
- struct xlist_head clean_xlist;
- struct xlist_head *clean_tail;
+ struct llist_node *clean_nodes;
+ struct llist_node *clean_tail;
LIST_HEAD(unmap_list);
LIST_HEAD(fmr_list);
unsigned long unpinned = 0;
@@ -605,7 +593,7 @@ static int rds_ib_flush_mr_pool(struct r

prepare_to_wait(&pool->flush_wait, &wait,
TASK_UNINTERRUPTIBLE);
- if (xlist_empty(&pool->clean_list))
+ if (llist_empty(&pool->clean_list))
schedule();

ibmr = rds_ib_reuse_fmr(pool);
@@ -630,10 +618,10 @@ static int rds_ib_flush_mr_pool(struct r
/* Get the list of all MRs to be dropped. Ordering matters -
* we want to put drop_list ahead of free_list.
*/
- xlist_append_to_list(&pool->drop_list, &unmap_list);
- xlist_append_to_list(&pool->free_list, &unmap_list);
+ llist_append_to_list(&pool->drop_list, &unmap_list);
+ llist_append_to_list(&pool->free_list, &unmap_list);
if (free_all)
- xlist_append_to_list(&pool->clean_list, &unmap_list);
+ llist_append_to_list(&pool->clean_list, &unmap_list);

free_goal = rds_ib_flush_goal(pool, free_all);

@@ -665,22 +653,22 @@ static int rds_ib_flush_mr_pool(struct r
if (!list_empty(&unmap_list)) {
/* we have to make sure that none of the things we're about
* to put on the clean list would race with other cpus trying
- * to pull items off. The xlist would explode if we managed to
+ * to pull items off. The llist would explode if we managed to
* remove something from the clean list and then add it back again
- * while another CPU was spinning on that same item in xlist_del_head.
+ * while another CPU was spinning on that same item in llist_del_first.
*
- * This is pretty unlikely, but just in case wait for an xlist grace period
+ * This is pretty unlikely, but just in case wait for an llist grace period
* here before adding anything back into the clean list.
*/
wait_clean_list_grace();

- list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
+ list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
if (ibmr_ret)
- refill_local(pool, &clean_xlist, ibmr_ret);
+ *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);

- /* refill_local may have emptied our list */
- if (!xlist_empty(&clean_xlist))
- xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
+ /* more than one entry in llist nodes */
+ if (clean_nodes->next)
+ llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);

}

@@ -731,9 +719,9 @@ void rds_ib_free_mr(void *trans_private,

/* Return it to the pool's free list */
if (ibmr->remap_count >= pool->fmr_attr.max_maps)
- xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
+ llist_add(&ibmr->llnode, &pool->drop_list);
else
- xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
+ llist_add(&ibmr->llnode, &pool->free_list);

atomic_add(ibmr->sg_len, &pool->free_pinned);
atomic_inc(&pool->dirty_count);
--- a/net/rds/xlist.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef _LINUX_XLIST_H
-#define _LINUX_XLIST_H
-
-#include <linux/stddef.h>
-#include <linux/poison.h>
-#include <linux/prefetch.h>
-#include <asm/system.h>
-
-struct xlist_head {
- struct xlist_head *next;
-};
-
-static inline void INIT_XLIST_HEAD(struct xlist_head *list)
-{
- list->next = NULL;
-}
-
-static inline int xlist_empty(struct xlist_head *head)
-{
- return head->next == NULL;
-}
-
-static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
- struct xlist_head *head)
-{
- struct xlist_head *cur;
- struct xlist_head *check;
-
- while (1) {
- cur = head->next;
- tail->next = cur;
- check = cmpxchg(&head->next, cur, new);
- if (check == cur)
- break;
- }
-}
-
-static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
-{
- struct xlist_head *cur;
- struct xlist_head *check;
- struct xlist_head *next;
-
- while (1) {
- cur = head->next;
- if (!cur)
- goto out;
-
- next = cur->next;
- check = cmpxchg(&head->next, cur, next);
- if (check == cur)
- goto out;
- }
-out:
- return cur;
-}
-
-static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
-{
- struct xlist_head *cur;
-
- cur = head->next;
- if (!cur)
- return NULL;
-
- head->next = cur->next;
- return cur;
-}
-
-static inline void xlist_splice(struct xlist_head *list,
- struct xlist_head *head)
-{
- struct xlist_head *cur;
-
- WARN_ON(head->next);
- cur = xchg(&list->next, NULL);
- head->next = cur;
-}
-
-#endif

2010-12-23 05:44:48

by Huang, Ying

Subject: [RFC -v9 2/4] lib, Add lock-less NULL terminated single list

Cmpxchg is used to implement adding a new entry to the list, deleting
all entries from the list, deleting the first entry of the list, and
some other operations.

Because this is a singly linked list, the tail can not be accessed in O(1).

If there are multiple producers and multiple consumers, llist_add can
be used in the producers and llist_del_all in the consumers. They can
work simultaneously without a lock. But llist_del_first can not be
used here, because llist_del_first depends on list->first->next not
changing while list->first stays unchanged during its operation, and
an llist_del_first, llist_add, llist_add sequence in another consumer
may violate that.

If there are multiple producers and one consumer, llist_add can be
used in producers and llist_del_all or llist_del_first can be used in
the consumer.

The list entries deleted via llist_del_all can be traversed with
traversal functions such as llist_for_each. But the list entries can
not be traversed safely before being deleted from the list, without
proper synchronization with the list consumers.

The basic atomic operation of this list is cmpxchg on a long. On
architectures that don't have an NMI-safe cmpxchg implementation, the
list can NOT be used in NMI handlers. So code that uses the list in
NMI handlers should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
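
As an illustration only (the structure and helper names below are made
up, not part of this patch), a multiple-producer / single-consumer use
could look like:

#include <linux/kernel.h>
#include <linux/llist.h>

struct hypothetical_event {
        int data;
        struct llist_node llnode;
};

static LLIST_HEAD(event_list);

/* producers: may run concurrently on any CPU */
static void hypothetical_produce(struct hypothetical_event *ev)
{
        llist_add(&ev->llnode, &event_list);
}

/* the single consumer: grab the whole list at once, then walk it */
static void hypothetical_consume(void)
{
        struct llist_node *node = llist_del_all(&event_list);
        struct hypothetical_event *ev;

        /* entries are returned newest-first (reverse order of addition) */
        llist_for_each_entry(ev, node, llnode)
                pr_info("event data: %d\n", ev->data);
}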

Signed-off-by: Huang Ying <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
---
include/linux/llist.h | 99 +++++++++++++++++++++++++++++++++++++++++
lib/Kconfig | 3 +
lib/Makefile | 2
lib/llist.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 223 insertions(+)
create mode 100644 include/linux/llist.h
create mode 100644 lib/llist.c

--- /dev/null
+++ b/include/linux/llist.h
@@ -0,0 +1,99 @@
+#ifndef LLIST_H
+#define LLIST_H
+/*
+ * Lock-less NULL terminated single linked list
+ *
+ * If there are multiple producers and multiple consumers, llist_add
+ * can be used in producers and llist_del_all can be used in
+ * consumers. They can work simultaneously without lock. But
+ * llist_del_first can not be used here. Because llist_del_first
+ * depends on list->first->next does not changed if list->first is not
+ * changed during its operation, but llist_del_first, llist_add,
+ * llist_add sequence in another consumer may violate that.
+ *
+ * If there are multiple producers and one consumer, llist_add can be
+ * used in producers and llist_del_all or llist_del_first can be used
+ * in the consumer.
+ *
+ * The list entries deleted via llist_del_all can be traversed with
+ * traversing function such as llist_for_each etc. But the list
+ * entries can not be traversed safely before deleted from the list
+ * without proper synchronization with the list consumers.
+ *
+ * The basic atomic operation of this list is cmpxchg on long. On
+ * architectures that don't have NMI-safe cmpxchg implementation, the
+ * list can NOT be used in NMI handler. So code uses the list in NMI
+ * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ */
+
+struct llist_head {
+ struct llist_node *first;
+};
+
+struct llist_node {
+ struct llist_node *next;
+};
+
+#define LLIST_HEAD_INIT(name) { NULL }
+#define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name)
+
+/**
+ * init_llist_head - initialize lock-less list head
+ * @head: the head for your lock-less list
+ */
+static inline void init_llist_head(struct llist_head *list)
+{
+ list->first = NULL;
+}
+
+/**
+ * llist_entry - get the struct of this entry
+ * @ptr: the &struct llist_node pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the llist_node within the struct.
+ */
+#define llist_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+/**
+ * llist_for_each - iterate over some entries of a lock-less list
+ * @pos: the &struct llist_node to use as a loop cursor
+ * @node: the first entry of deleted list entries
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being deleted from list, so start with an entry
+ * instead of list head.
+ */
+#define llist_for_each(pos, node) \
+ for (pos = (node); pos; pos = pos->next)
+
+/**
+ * llist_for_each_entry - iterate over some entries of lock-less list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @node: the fist entry of deleted list entries.
+ * @member: the name of the llist_node with the struct.
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being removed from list, so start with an entry
+ * instead of list head.
+ */
+#define llist_for_each_entry(pos, node, member) \
+ for (pos = llist_entry((node), typeof(*pos), member); \
+ &pos->member != NULL; \
+ pos = llist_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * llist_empty - tests whether a lock-less list is empty
+ * @head: the list to test
+ */
+static inline int llist_empty(const struct llist_head *head)
+{
+ return head->first == NULL;
+}
+
+void llist_add(struct llist_node *new, struct llist_head *head);
+void llist_add_batch(struct llist_node *new, struct llist_node *tail,
+ struct llist_head *head);
+struct llist_node *llist_del_first(struct llist_head *head);
+struct llist_node *llist_del_all(struct llist_head *head);
+#endif /* LLIST_H */
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -210,4 +210,7 @@ config GENERIC_ATOMIC64
config LRU_CACHE
tristate

+config LLIST
+ bool
+
endmenu
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -106,6 +106,8 @@ obj-$(CONFIG_GENERIC_ATOMIC64) += atomic

obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o

+obj-$(CONFIG_LLIST) += llist.o
+
hostprogs-y := gen_crc32table
clean-files := crc32table.h

--- /dev/null
+++ b/lib/llist.c
@@ -0,0 +1,119 @@
+/*
+ * Lock-less NULL terminated single linked list
+ *
+ * The basic atomic operation of this list is cmpxchg on long. On
+ * architectures that don't have NMI-safe cmpxchg implementation, the
+ * list can NOT be used in NMI handler. So code uses the list in NMI
+ * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ *
+ * Copyright 2010 Intel Corp.
+ * Author: Huang Ying <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/llist.h>
+
+#include <asm/system.h>
+
+/**
+ * llist_add - add a new entry
+ * @new: new entry to be added
+ * @head: the head for your lock-less list
+ */
+void llist_add(struct llist_node *new, struct llist_head *head)
+{
+ struct llist_node *entry;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ BUG_ON(in_nmi());
+#endif
+
+ do {
+ entry = head->first;
+ new->next = entry;
+ } while (cmpxchg(&head->first, entry, new) != entry);
+}
+EXPORT_SYMBOL_GPL(llist_add);
+
+/**
+ * llist_add_batch - add several linked entries in batch
+ * @new: first entry in batch to be added
+ * @tail: last entry in batch to be added
+ * @head: the head for your lock-less list
+ */
+void llist_add_batch(struct llist_node *new, struct llist_node *tail,
+ struct llist_head *head)
+{
+ struct llist_node *entry;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ BUG_ON(in_nmi());
+#endif
+
+ do {
+ entry = head->first;
+ tail->next = entry;
+ } while (cmpxchg(&head->first, entry, new) != entry);
+}
+EXPORT_SYMBOL_GPL(llist_add_batch);
+
+/**
+ * llist_del_first - delete the first entry of lock-less list
+ * @head: the head for your lock-less list
+ *
+ * If list is empty, return NULL, otherwise, return the first entry deleted.
+ *
+ * Only one llist_del_first user can be used simultaneously with
+ * multiple llist_add users without lock. Because otherwise
+ * llist_del_first, llist_add, llist_add sequence in another user may
+ * change @head->first->next, but keep @head->first. If multiple
+ * consumers are needed, please use llist_del_all.
+ */
+struct llist_node *llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ BUG_ON(in_nmi());
+#endif
+
+ do {
+ entry = head->first;
+ if (entry == NULL)
+ return NULL;
+ } while (cmpxchg(&head->first, entry, entry->next) != entry);
+
+ return entry;
+}
+EXPORT_SYMBOL_GPL(llist_del_first);
+
+/**
+ * llist_del_all - delete all entries from lock-less list
+ * @head: the head of lock-less list to delete all entries
+ *
+ * If list is empty, return NULL, otherwise, delete all entries and
+ * return the pointer to the first entry.
+ */
+struct llist_node *llist_del_all(struct llist_head *head)
+{
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ BUG_ON(in_nmi());
+#endif
+
+ return xchg(&head->first, NULL);
+}
+EXPORT_SYMBOL_GPL(llist_del_all);

2010-12-23 05:45:09

by Huang, Ying

Subject: [RFC -v9 1/4] Add Kconfig option ARCH_HAVE_NMI_SAFE_CMPXCHG

cmpxchg() is widely used by lockless code, including NMI-safe lockless
code. But on some architectures, the cmpxchg() implementation is not
NMI-safe; on these architectures the lockless code may need to fall
back to a spin_trylock_irqsave() based implementation.

This patch adds a Kconfig option: ARCH_HAVE_NMI_SAFE_CMPXCHG, so that
NMI-safe lockless code can depend on it or provide different
implementation according to it.

On many architectures, cmpxchg is only NMI-safe for several specific
operand sizes. So the ARCH_HAVE_NMI_SAFE_CMPXCHG option defined in this
patch only guarantees that cmpxchg is NMI-safe for operands of
sizeof(unsigned long).
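
For illustration, a hypothetical user could express the requirement in
its own Kconfig entry like this (the config name below is made up, not
part of this patch):

config HYPOTHETICAL_NMI_FEATURE
        bool "Hypothetical feature using cmpxchg from NMI context"
        depends on ARCH_HAVE_NMI_SAFE_CMPXCHG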

Signed-off-by: Huang Ying <[email protected]>
Acked-by: Mike Frysinger <[email protected]>
Acked-by: Paul Mundt <[email protected]>
Acked-by: Hans-Christian Egtvedt <[email protected]>
Acked-by: Benjamin Herrenschmidt <[email protected]>
Acked-by: Chris Metcalf <[email protected]>
CC: Richard Henderson <[email protected]>
CC: Russell King <[email protected]>
CC: Mikael Starvik <[email protected]>
CC: David Howells <[email protected]>
CC: Yoshinori Sato <[email protected]>
CC: Tony Luck <[email protected]>
CC: Hirokazu Takata <[email protected]>
CC: Geert Uytterhoeven <[email protected]>
CC: Michal Simek <[email protected]>
CC: Ralf Baechle <[email protected]>
CC: Kyle McMartin <[email protected]>
CC: Martin Schwidefsky <[email protected]>
CC: Chen Liqin <[email protected]>
CC: "David S. Miller" <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Chris Zankel <[email protected]>
---
arch/Kconfig | 3 +++
arch/alpha/Kconfig | 1 +
arch/avr32/Kconfig | 1 +
arch/frv/Kconfig | 1 +
arch/ia64/Kconfig | 1 +
arch/m68k/Kconfig | 1 +
arch/parisc/Kconfig | 1 +
arch/powerpc/Kconfig | 1 +
arch/s390/Kconfig | 1 +
arch/sh/Kconfig | 1 +
arch/sparc/Kconfig | 1 +
arch/tile/Kconfig | 1 +
arch/x86/Kconfig | 1 +
13 files changed, 15 insertions(+)

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI
config HAVE_ARCH_JUMP_LABEL
bool

+config ARCH_HAVE_NMI_SAFE_CMPXCHG
+ bool
+
source "kernel/gcov/Kconfig"
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -7,6 +7,7 @@ config ALPHA
select HAVE_SYSCALL_WRAPPERS
select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
select HAVE_DMA_ATTRS
help
The Alpha is a 64-bit general-purpose processor designed and
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -6,6 +6,7 @@ config AVR32
select HAVE_CLK
select HAVE_OPROFILE
select HAVE_KPROBES
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
help
AVR32 is a high-performance 32-bit RISC microprocessor core,
designed for cost-sensitive embedded applications, with particular
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -5,6 +5,7 @@ config FRV
select HAVE_ARCH_TRACEHOOK
select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG

config ZONE_DMA
bool
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -22,6 +22,7 @@ config IA64
select HAVE_KVM
select HAVE_ARCH_TRACEHOOK
select HAVE_DMA_API_DEBUG
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
default y
help
The Itanium Processor Family is Intel's 64-bit successor to
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -4,6 +4,7 @@ config M68K
select HAVE_AOUT
select HAVE_IDE
select GENERIC_ATOMIC64
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS

config MMU
bool
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -11,6 +11,7 @@ config PARISC
select BUG
select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_ATOMIC64 if !64BIT
select GENERIC_HARDIRQS_NO__DO_IRQ
help
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,6 +138,7 @@ config PPC
select GENERIC_ATOMIC64 if PPC32
select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64

--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -94,6 +94,7 @@ config S390
select INIT_ALL_POSSIBLE
select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -11,6 +11,7 @@ config SUPERH
select HAVE_DMA_ATTRS
select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
select PERF_USE_VMALLOC
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -22,6 +22,7 @@ config SPARC
select RTC_CLASS
select RTC_DRV_M48T59
select HAVE_IRQ_WORK
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
select HAVE_DMA_ATTRS
select HAVE_DMA_API_DEBUG
select HAVE_ARCH_JUMP_LABEL
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -104,6 +104,7 @@ config TILE
select GENERIC_FIND_NEXT_BIT
select USE_GENERIC_SMP_HELPERS
select CC_OPTIMIZE_FOR_SIZE
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG

# FIXME: investigate whether we need/want these options.
# select HAVE_IOREMAP_PROT
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,6 +23,7 @@ config X86
select HAVE_OPROFILE
select HAVE_PERF_EVENTS
select HAVE_IRQ_WORK
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG if !M386
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select HAVE_MEMBLOCK

2010-12-23 06:07:59

by Valdis Klētnieks

Subject: Re: [RFC -v9 0/4] Lock-less list

On Thu, 23 Dec 2010 13:43:19 +0800, Huang Ying said:
> Add a lock-less NULL-terminated linked list implementation. And use
> that in irq_work and replace net/rds/xlist.h.

A quick overview of the code looks mostly sane. What I don't see is
an explanation of *why* this is being added. What benefits does it
have over the current code? Is it faster? Smaller? Simply getting rid
of near-duplicate versions in rds and irq_work? Something else?




2010-12-23 07:06:11

by Valdis Klētnieks

Subject: Re: [RFC -v9 4/4] net, rds, Replace xlist in net/rds/xlist.h with llist

On Thu, 23 Dec 2010 13:43:23 +0800, Huang Ying said:
> The functionality of xlist and llist is almost the same. This patch
> replaces xlist with llist to avoid code duplication.

> /*
> - * this takes a list head of mrs and turns it into an xlist of clusters.
> - * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
> - * reuse.
> + * this takes a list head of mrs and turns it into linked llist nodes.
> */

This comment change loses a lot of information. The original 3 lines
tells me a lot about what the data structure is and what it's used for,
the replacement is a 'b +=5; /* add 5 to b */' type of comment.




2010-12-23 08:31:42

by Huang, Ying

Subject: Re: [RFC -v9 0/4] Lock-less list

Hi, Valdis,

Thanks for your comments.

On Thu, 2010-12-23 at 14:05 +0800, [email protected] wrote:
> On Thu, 23 Dec 2010 13:43:19 +0800, Huang Ying said:
> > Add a lock-less NULL-terminated linked list implementation. And use
> > that in irq_work and replace net/rds/xlist.h.
>
> A quick overview of the code looks mostly sane. What I don't see is
> an explanation of *why* this is being added. What benefits does it
> have over the current code? Is it faster? Smaller? Simply getting rid
> of near-duplicate versions in rds and irq_work? Something else?

The code is almost the same. The point is just to avoid code
duplication, provide better documentation, and make it easier for
future users.

Best Regards,
Huang Ying


2010-12-23 08:31:52

by Huang, Ying

Subject: Re: [RFC -v9 4/4] net, rds, Replace xlist in net/rds/xlist.h with llist

Hi,
On Thu, 2010-12-23 at 15:05 +0800, [email protected] wrote:
> On Thu, 23 Dec 2010 13:43:23 +0800, Huang Ying said:
> > The functionality of xlist and llist is almost the same. This patch
> > replaces xlist with llist to avoid code duplication.
>
> > /*
> > - * this takes a list head of mrs and turns it into an xlist of clusters.
> > - * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
> > - * reuse.
> > + * this takes a list head of mrs and turns it into linked llist nodes.
> > */
>
> This comment change loses a lot of information. The original 3 lines
> tells me a lot about what the data structure is and what it's used for,
> the replacement is a 'b +=5; /* add 5 to b */' type of comment.

Sorry, maybe I misunderstand the comments and code. From the comments
I imagine the result is a two-dimensional lock-less list, but in the
code I only find a one-dimensional lock-less list. Is that right?

Best Regards,
Huang Ying

2010-12-23 17:35:19

by David Miller

Subject: Re: [RFC -v9 1/4] Add Kconfig option ARCH_HAVE_NMI_SAFE_CMPXCHG

From: Huang Ying <[email protected]>
Date: Thu, 23 Dec 2010 13:43:20 +0800

> --- a/arch/sparc/Kconfig
> +++ b/arch/sparc/Kconfig
> @@ -22,6 +22,7 @@ config SPARC
> select RTC_CLASS
> select RTC_DRV_M48T59
> select HAVE_IRQ_WORK
> + select ARCH_HAVE_NMI_SAFE_CMPXCHG
> select HAVE_DMA_ATTRS
> select HAVE_DMA_API_DEBUG
> select HAVE_ARCH_JUMP_LABEL

This should only be set for "SPARC64". SPARC32's cmpxchg uses spinlocks and
IRQ disabling, which is not NMI safe.

2010-12-23 21:53:42

by Chris Mason

Subject: Re: [RFC -v9 4/4] net, rds, Replace xlist in net/rds/xlist.h with llist

Excerpts from Huang Ying's message of 2010-12-23 00:43:23 -0500:
> The functionality of xlist and llist is almost the same. This patch
> replaces xlist with llist to avoid code duplication.

I'm not against it, the goal was always to move this to include/linux as
more users popped up. We can test it out here with rds RDMA.

-chris

2010-12-24 00:29:59

by Huang, Ying

Subject: Re: [RFC -v9 4/4] net, rds, Replace xlist in net/rds/xlist.h with llist

On Fri, 2010-12-24 at 05:51 +0800, Chris Mason wrote:
> Excerpts from Huang Ying's message of 2010-12-23 00:43:23 -0500:
> > The functionality of xlist and llist is almost the same. This patch
> > replaces xlist with llist to avoid code duplication.
>
> I'm not against it, the goal was always to move this to include/linux as
> more users popped up. We can test it out here with rds RDMA.

Thanks for your help.

Best Regards,
Huang Ying

2010-12-24 00:56:52

by Huang, Ying

Subject: Re: [RFC -v9 1/4] Add Kconfig option ARCH_HAVE_NMI_SAFE_CMPXCHG

Hi, David,

Thanks for review.

On Fri, 2010-12-24 at 01:35 +0800, David Miller wrote:
> From: Huang Ying <[email protected]>
> Date: Thu, 23 Dec 2010 13:43:20 +0800
>
> > --- a/arch/sparc/Kconfig
> > +++ b/arch/sparc/Kconfig
> > @@ -22,6 +22,7 @@ config SPARC
> > select RTC_CLASS
> > select RTC_DRV_M48T59
> > select HAVE_IRQ_WORK
> > + select ARCH_HAVE_NMI_SAFE_CMPXCHG
> > select HAVE_DMA_ATTRS
> > select HAVE_DMA_API_DEBUG
> > select HAVE_ARCH_JUMP_LABEL
>
> This should only be set for "SPARC64". SPARC32's cmpxchg uses spinlocks and
> IRQ disabling, which is not NMI safe.

I will fix this.

Best Regards,
Huang Ying