Hello Gentlemen,
This is a development of the "percpu_ida" library. I named it
"percpu_tags" to simplify review, since most of the "percpu_ida"
code is gone and a diff would not be informative.
While the concept of per-cpu arrays is preserved, the
implementation is heavily reworked. The main objective is to
improve the "percpu_ida" locking scheme.
Here is the list of major differences between "percpu_ida" and
"percpu_tags":
* The global freelist is gone. As a result, CPUs do not compete
for the global lock;
* Long-running operations (scanning through a cpumask) are
executed with local interrupts enabled;
* The percpu_ida::percpu_max_size limit is eliminated. Instead,
the limit is determined dynamically: depending on how many CPUs
are requesting tags, each CPU gets a fair share of the tag space;
* An attempt is made to return a tag to the CPU it was allocated
on. As a result, the locality of data associated with the tag is
expected to benefit.
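For reference, here is a minimal usage sketch of the interface as
declared in include/linux/percpu_tags.h from this series (the pool
size is arbitrary and error handling is trimmed):

        static struct percpu_tags pool;

        static int example(void)
        {
                int tag, rc;

                rc = percpu_tags_init(&pool, 128);  /* arbitrary pool size */
                if (rc < 0)
                        return rc;

                /* TASK_RUNNING: fail instead of sleeping when out of tags */
                tag = percpu_tags_alloc(&pool, TASK_RUNNING);
                if (tag >= 0)
                        /* attempted to go back to the allocating CPU */
                        percpu_tags_free(&pool, tag);

                percpu_tags_destroy(&pool);
                return 0;
        }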
The code is largely raw and untested. The reason I am posting is
that the prototype implementation performs 2-3 times faster than
"percpu_ida", so I would like to find out whether it is worth
going further with this approach or whether there is a no-go.
The performance test is not rigorous, though. I used "fio" random
reads against a "null_blk" device sitting on top of "percpu_tags",
which is not exactly how "percpu_ida" is used. This is another
reason I am posting - advice on how to test this properly would be
much appreciated.
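For what it is worth, the kind of synthetic exerciser I could add
is sketched below - a hypothetical kthread-based module, not the
test that produced the numbers above:

        #include <linux/err.h>
        #include <linux/kthread.h>
        #include <linux/module.h>
        #include <linux/percpu_tags.h>

        #define NR_WORKERS 8    /* arbitrary */

        static struct percpu_tags pool;
        static struct task_struct *workers[NR_WORKERS];

        static int worker_fn(void *unused)
        {
                while (!kthread_should_stop()) {
                        /* TASK_RUNNING: do not sleep when the pool is empty */
                        int tag = percpu_tags_alloc(&pool, TASK_RUNNING);

                        if (tag >= 0)
                                percpu_tags_free(&pool, tag);
                        cond_resched();
                }
                return 0;
        }

        static int __init pt_stress_init(void)
        {
                int i, rc = percpu_tags_init(&pool, 64);  /* arbitrary */

                if (rc < 0)
                        return rc;
                for (i = 0; i < NR_WORKERS; i++)
                        workers[i] = kthread_run(worker_fn, NULL,
                                                 "pt-stress/%d", i);
                return 0;
        }

        static void __exit pt_stress_exit(void)
        {
                int i;

                for (i = 0; i < NR_WORKERS; i++)
                        if (!IS_ERR_OR_NULL(workers[i]))
                                kthread_stop(workers[i]);
                percpu_tags_destroy(&pool);
        }

        module_init(pt_stress_init);
        module_exit(pt_stress_exit);
        MODULE_LICENSE("GPL");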
The source code can be found at
https://github.com/a-gordeev/linux.git percpu_tags-v0
Thanks!
Cc: [email protected]
Cc: [email protected]
Cc: Nicholas Bellinger <[email protected]>
Cc: Kent Overstreet <[email protected]>
Cc: "Michael S. Tsirkin" <[email protected]>
Alexander Gordeev (2):
percpu_tags: Prototype implementation
percpu_tags: Use percpu_tags instead of percpu_ida
drivers/scsi/qla2xxx/qla_target.c | 6 +-
drivers/target/iscsi/iscsi_target_util.c | 6 +-
drivers/target/target_core_transport.c | 4 +-
drivers/target/tcm_fc/tfc_cmd.c | 8 +-
drivers/vhost/scsi.c | 6 +-
include/linux/percpu_tags.h | 37 ++
include/target/target_core_base.h | 4 +-
lib/Makefile | 2 +-
lib/percpu_tags.c | 556 ++++++++++++++++++++++++++++++
9 files changed, 611 insertions(+), 18 deletions(-)
create mode 100644 include/linux/percpu_tags.h
create mode 100644 lib/percpu_tags.c
--
1.7.7.6
All users of "percpu_ida" are blindly converted to
"percpu_tags" users. No testing has been conducted.
Signed-off-by: Alexander Gordeev <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Nicholas Bellinger <[email protected]>
Cc: Kent Overstreet <[email protected]>
Cc: "Michael S. Tsirkin" <[email protected]>
---
drivers/scsi/qla2xxx/qla_target.c | 6 +++---
drivers/target/iscsi/iscsi_target_util.c | 6 +++---
drivers/target/target_core_transport.c | 4 ++--
drivers/target/tcm_fc/tfc_cmd.c | 8 ++++----
drivers/vhost/scsi.c | 6 +++---
include/target/target_core_base.h | 4 ++--
6 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c
index e632e14..cec4847 100644
--- a/drivers/scsi/qla2xxx/qla_target.c
+++ b/drivers/scsi/qla2xxx/qla_target.c
@@ -2729,7 +2729,7 @@ void qlt_free_cmd(struct qla_tgt_cmd *cmd)
WARN_ON(1);
return;
}
- percpu_ida_free(&sess->se_sess->sess_tag_pool, cmd->se_cmd.map_tag);
+ percpu_tags_free(&sess->se_sess->sess_tag_pool, cmd->se_cmd.map_tag);
}
EXPORT_SYMBOL(qlt_free_cmd);
@@ -3153,7 +3153,7 @@ out_term:
*/
spin_lock_irqsave(&ha->hardware_lock, flags);
qlt_send_term_exchange(vha, NULL, &cmd->atio, 1);
- percpu_ida_free(&sess->se_sess->sess_tag_pool, cmd->se_cmd.map_tag);
+ percpu_tags_free(&sess->se_sess->sess_tag_pool, cmd->se_cmd.map_tag);
ha->tgt.tgt_ops->put_sess(sess);
spin_unlock_irqrestore(&ha->hardware_lock, flags);
}
@@ -3173,7 +3173,7 @@ static struct qla_tgt_cmd *qlt_get_tag(scsi_qla_host_t *vha,
struct qla_tgt_cmd *cmd;
int tag;
- tag = percpu_ida_alloc(&se_sess->sess_tag_pool, TASK_RUNNING);
+ tag = percpu_tags_alloc(&se_sess->sess_tag_pool, TASK_RUNNING);
if (tag < 0)
return NULL;
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index fd90b28..f76729e 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -17,7 +17,7 @@
******************************************************************************/
#include <linux/list.h>
-#include <linux/percpu_ida.h>
+#include <linux/percpu_tags.h>
#include <scsi/scsi_tcq.h>
#include <scsi/iscsi_proto.h>
#include <target/target_core_base.h>
@@ -158,7 +158,7 @@ struct iscsi_cmd *iscsit_allocate_cmd(struct iscsi_conn *conn, int state)
struct se_session *se_sess = conn->sess->se_sess;
int size, tag;
- tag = percpu_ida_alloc(&se_sess->sess_tag_pool, state);
+ tag = percpu_tags_alloc(&se_sess->sess_tag_pool, state);
if (tag < 0)
return NULL;
@@ -701,7 +701,7 @@ void iscsit_release_cmd(struct iscsi_cmd *cmd)
kfree(cmd->iov_data);
kfree(cmd->text_in_ptr);
- percpu_ida_free(&sess->se_sess->sess_tag_pool, se_cmd->map_tag);
+ percpu_tags_free(&sess->se_sess->sess_tag_pool, se_cmd->map_tag);
}
EXPORT_SYMBOL(iscsit_release_cmd);
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 7fa62fc..5c7f6f4 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -272,7 +272,7 @@ int transport_alloc_session_tags(struct se_session *se_sess,
}
}
- rc = percpu_ida_init(&se_sess->sess_tag_pool, tag_num);
+ rc = percpu_tags_init(&se_sess->sess_tag_pool, tag_num);
if (rc < 0) {
pr_err("Unable to init se_sess->sess_tag_pool,"
" tag_num: %u\n", tag_num);
@@ -445,7 +445,7 @@ EXPORT_SYMBOL(transport_deregister_session_configfs);
void transport_free_session(struct se_session *se_sess)
{
if (se_sess->sess_cmd_map) {
- percpu_ida_destroy(&se_sess->sess_tag_pool);
+ percpu_tags_destroy(&se_sess->sess_tag_pool);
if (is_vmalloc_addr(se_sess->sess_cmd_map))
vfree(se_sess->sess_cmd_map);
else
diff --git a/drivers/target/tcm_fc/tfc_cmd.c b/drivers/target/tcm_fc/tfc_cmd.c
index be0c0d0..6176c92 100644
--- a/drivers/target/tcm_fc/tfc_cmd.c
+++ b/drivers/target/tcm_fc/tfc_cmd.c
@@ -28,7 +28,7 @@
#include <linux/configfs.h>
#include <linux/ctype.h>
#include <linux/hash.h>
-#include <linux/percpu_ida.h>
+#include <linux/percpu_tags.h>
#include <asm/unaligned.h>
#include <scsi/scsi.h>
#include <scsi/scsi_host.h>
@@ -100,7 +100,7 @@ static void ft_free_cmd(struct ft_cmd *cmd)
if (fr_seq(fp))
lport->tt.seq_release(fr_seq(fp));
fc_frame_free(fp);
- percpu_ida_free(&sess->se_sess->sess_tag_pool, cmd->se_cmd.map_tag);
+ percpu_tags_free(&sess->se_sess->sess_tag_pool, cmd->se_cmd.map_tag);
ft_sess_put(sess); /* undo get from lookup at recv */
}
@@ -456,7 +456,7 @@ static void ft_recv_cmd(struct ft_sess *sess, struct fc_frame *fp)
struct se_session *se_sess = sess->se_sess;
int tag;
- tag = percpu_ida_alloc(&se_sess->sess_tag_pool, TASK_RUNNING);
+ tag = percpu_tags_alloc(&se_sess->sess_tag_pool, TASK_RUNNING);
if (tag < 0)
goto busy;
@@ -467,7 +467,7 @@ static void ft_recv_cmd(struct ft_sess *sess, struct fc_frame *fp)
cmd->sess = sess;
cmd->seq = lport->tt.seq_assign(lport, fp);
if (!cmd->seq) {
- percpu_ida_free(&se_sess->sess_tag_pool, tag);
+ percpu_tags_free(&se_sess->sess_tag_pool, tag);
goto busy;
}
cmd->req_frame = fp; /* hold frame during cmd */
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 69906ca..30dd962 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -48,7 +48,7 @@
#include <linux/virtio_scsi.h>
#include <linux/llist.h>
#include <linux/bitmap.h>
-#include <linux/percpu_ida.h>
+#include <linux/percpu_tags.h>
#include "vhost.h"
@@ -472,7 +472,7 @@ static void tcm_vhost_release_cmd(struct se_cmd *se_cmd)
}
tcm_vhost_put_inflight(tv_cmd->inflight);
- percpu_ida_free(&se_sess->sess_tag_pool, se_cmd->map_tag);
+ percpu_tags_free(&se_sess->sess_tag_pool, se_cmd->map_tag);
}
static int tcm_vhost_shutdown_session(struct se_session *se_sess)
@@ -739,7 +739,7 @@ vhost_scsi_get_tag(struct vhost_virtqueue *vq, struct tcm_vhost_tpg *tpg,
}
se_sess = tv_nexus->tvn_se_sess;
- tag = percpu_ida_alloc(&se_sess->sess_tag_pool, TASK_RUNNING);
+ tag = percpu_tags_alloc(&se_sess->sess_tag_pool, TASK_RUNNING);
if (tag < 0) {
pr_err("Unable to obtain tag for tcm_vhost_cmd\n");
return ERR_PTR(-ENOMEM);
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 9ec9864..fd06ecf 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -5,7 +5,7 @@
#include <linux/configfs.h>
#include <linux/dma-mapping.h>
#include <linux/blkdev.h>
-#include <linux/percpu_ida.h>
+#include <linux/percpu_tags.h>
#include <scsi/scsi_cmnd.h>
#include <net/sock.h>
#include <net/tcp.h>
@@ -620,7 +620,7 @@ struct se_session {
spinlock_t sess_cmd_lock;
struct kref sess_kref;
void *sess_cmd_map;
- struct percpu_ida sess_tag_pool;
+ struct percpu_tags sess_tag_pool;
};
struct se_device;
--
1.7.7.6
This is a development of the "percpu_ida" library. The major
differences between "percpu_ida" and "percpu_tags" are:
* The global freelist is gone. As a result, CPUs do not compete
for the global lock;
* Long-running operations (scanning through a cpumask) are
executed with local interrupts enabled;
* The percpu_ida::percpu_max_size limit is eliminated. Instead,
the limit is determined dynamically: depending on how many CPUs
are requesting tags, each CPU gets a fair share of the tag space;
* An attempt is made to return a tag to the CPU it was allocated
on. As a result, the locality of data associated with the tag is
expected to benefit.
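To illustrate the dynamic limit with concrete numbers (this
mirrors cache_size() and batch_size() below):

        /*
         * E.g. nr_tags = 64 and 4 CPUs currently holding tags:
         * each CPU's cache is capped at 64 / 4 = 16 tags, and
         * surplus tags are pulled between caches in batches of
         * max(1, min(PERCPU_TAGS_BATCH_MAX, 16 / 4)) = 4.
         */
        share = nr_active_cpus ? nr_tags / nr_active_cpus : nr_tags;
        batch = max(1, min(PERCPU_TAGS_BATCH_MAX, share / 4));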
Signed-off-by: Alexander Gordeev <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Nicholas Bellinger <[email protected]>
Cc: Kent Overstreet <[email protected]>
Cc: "Michael S. Tsirkin" <[email protected]>
---
include/linux/percpu_tags.h | 37 +++
lib/Makefile | 2 +-
lib/percpu_tags.c | 556 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 594 insertions(+), 1 deletions(-)
create mode 100644 include/linux/percpu_tags.h
create mode 100644 lib/percpu_tags.c
diff --git a/include/linux/percpu_tags.h b/include/linux/percpu_tags.h
new file mode 100644
index 0000000..ee89863
--- /dev/null
+++ b/include/linux/percpu_tags.h
@@ -0,0 +1,37 @@
+#ifndef __PERCPU_TAGS_H__
+#define __PERCPU_TAGS_H__
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/spinlock_types.h>
+#include <linux/wait.h>
+#include <linux/cpumask.h>
+
+struct percpu_cache;
+
+struct percpu_tags {
+ int nr_tags;
+ struct percpu_cache __percpu *cache;
+
+ int *tag_cpu_map;
+
+ cpumask_t alloc_tags;
+ cpumask_t free_tags;
+ cpumask_t wait_tags;
+};
+
+int percpu_tags_alloc(struct percpu_tags *pt, int state);
+void percpu_tags_free(struct percpu_tags *pt, int tag);
+
+int percpu_tags_init(struct percpu_tags *pt, int nr_tags);
+void percpu_tags_destroy(struct percpu_tags *pt);
+
+int percpu_tags_for_each_free(struct percpu_tags *pt,
+ int (*fn)(unsigned, void *), void *data);
+int percpu_tags_free_tags(struct percpu_tags *pt, int cpu);
+
+#define PERCPU_TAGS_BATCH_MAX 4
+
+#endif /* __PERCPU_TAGS_H__ */
diff --git a/lib/Makefile b/lib/Makefile
index ba967a1..671143f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
- percpu-refcount.o percpu_ida.o hash.o
+ percpu-refcount.o percpu_ida.o percpu_tags.o hash.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += kstrtox.o
diff --git a/lib/percpu_tags.c b/lib/percpu_tags.c
new file mode 100644
index 0000000..7bb61f5
--- /dev/null
+++ b/lib/percpu_tags.c
@@ -0,0 +1,556 @@
+/*
+ * Percpu tags library, based on percpu IDA library
+ *
+ * Copyright (C) 2014 Red Hat, Inc. Alexander Gordeev
+ * Copyright (C) 2013 Datera, Inc. Kent Overstreet
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/percpu_tags.h>
+
+struct percpu_cache {
+ spinlock_t lock;
+ int cpu; /* CPU this cache belongs to */
+ wait_queue_head_t wait; /* tasks waiting for a tag */
+
+ int nr_alloc; /* nr of allocated tags */
+
+ int nr_free; /* nr of unallocated tags */
+ int freelist[];
+};
+
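+/*
+ * Lock a (possibly remote) CPU's cache. Local interrupts are disabled
+ * only when the lock being taken belongs to the current CPU's cache,
+ * so scans over remote caches run with local interrupts enabled.
+ */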
+#define spin_lock_irqsave_cond(lock, cpu, flags) \
+do { \
+ preempt_disable(); \
+ if ((cpu) == smp_processor_id()) \
+ local_irq_save(flags); \
+ spin_lock(lock); \
+ preempt_enable(); \
+} while (0)
+
+#define spin_unlock_irqrestore_cond(lock, cpu, flags) \
+do { \
+ int __this_cpu = smp_processor_id(); \
+ spin_unlock(lock); \
+ if (cpu == __this_cpu) \
+ local_irq_restore(flags); \
+} while (0)
+
+#define double_spin_lock_irqsave_cond(lock1, cpu1, lock2, cpu2, flags) \
+do { \
+ spinlock_t *__lock1 = (lock1); \
+ spinlock_t *__lock2 = (lock2); \
+ int __this_cpu; \
+ \
+ if (__lock1 > __lock2) \
+ swap(__lock1, __lock2); \
+ \
+ preempt_disable(); \
+ __this_cpu = smp_processor_id(); \
+ if ((cpu1) == __this_cpu || (cpu2) == __this_cpu) \
+ local_irq_save(flags); \
+ spin_lock(__lock1); \
+ spin_lock_nested(__lock2, SINGLE_DEPTH_NESTING); \
+ preempt_enable(); \
+} while (0)
+
+static inline void cpu_to_tag(struct percpu_tags *pt, int cpu, int tag)
+{
+ smp_rmb();
+ if (pt->tag_cpu_map[tag] != cpu) {
+ pt->tag_cpu_map[tag] = cpu;
+ smp_wmb();
+ }
+}
+
+static inline int cpu_from_tag(struct percpu_tags *pt, int tag)
+{
+ smp_rmb();
+ return pt->tag_cpu_map[tag];
+}
+
+static void move_tags(int *dst, int *nr_dst, int *src, int *nr_src, size_t nr)
+{
+ *nr_src -= nr;
+ memcpy(dst + *nr_dst, src + *nr_src, sizeof(*dst) * nr);
+ *nr_dst += nr;
+}
+
+static int batch_size(int nr_cache)
+{
+ return max(1, min(PERCPU_TAGS_BATCH_MAX, nr_cache / 4));
+}
+
+static int cache_size(struct percpu_tags *pt)
+{
+ int weight;
+
+ /*
+ * Bitmask percpu_tags::alloc_tags is used to indicate the number
+ * of currently active CPUs, although it is unlikely to reflect
+ * CPU activity reliably - we need a better heuristic here.
+ */
+ smp_rmb();
+ weight = cpumask_weight(&pt->alloc_tags);
+
+ if (weight)
+ return pt->nr_tags / weight;
+ else
+ return pt->nr_tags;
+}
+
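+/*
+ * Try to allocate from the current CPU's cache. Fails with -ENOSPC if
+ * the cache is empty or (unless tags are scarce) already holds its
+ * fair share of the tag space.
+ */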
+static int alloc_tag_local(struct percpu_tags *pt)
+{
+ bool scarce = pt->nr_tags < cpumask_weight(cpu_online_mask);
+ int nr_cache = cache_size(pt);
+ struct percpu_cache *tags = this_cpu_ptr(pt->cache);
+ int cpu = tags->cpu;
+ unsigned long uninitialized_var(flags);
+ int tag;
+
+ spin_lock_irqsave_cond(&tags->lock, cpu, flags);
+
+ if (!tags->nr_free ||
+ (!scarce && (tags->nr_free + tags->nr_alloc > nr_cache))) {
+ spin_unlock_irqrestore_cond(&tags->lock, cpu, flags);
+ return -ENOSPC;
+ }
+
+ tags->nr_alloc++;
+ tags->nr_free--;
+
+ tag = tags->freelist[tags->nr_free];
+
+ if (!tags->nr_free)
+ cpumask_clear_cpu(cpu, &pt->free_tags);
+ cpumask_set_cpu(cpu, &pt->alloc_tags);
+
+ spin_unlock_irqrestore_cond(&tags->lock, cpu, flags);
+
+ cpu_to_tag(pt, cpu, tag);
+
+ return tag;
+}
+
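+/*
+ * Pull a batch of free tags from a remote cache into the destination
+ * cache and allocate one of them. Unless @force, only a remote
+ * cache's surplus over its fair share is pulled.
+ */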
+static int pull_tag(struct percpu_tags *pt,
+ struct percpu_cache *dtags, struct percpu_cache *stags,
+ bool force)
+{
+ int nr_cache = cache_size(pt);
+ int nr_batch = batch_size(nr_cache);
+ int nr_pull;
+ unsigned long uninitialized_var(flags);
+ int tag;
+
+ double_spin_lock_irqsave_cond(&stags->lock, stags->cpu,
+ &dtags->lock, dtags->cpu, flags);
+
+ if (force) {
+ nr_pull = min(nr_batch, stags->nr_free);
+ } else {
+ nr_pull = stags->nr_free + stags->nr_alloc - nr_cache;
+ nr_pull = min(nr_pull, stags->nr_free);
+ nr_pull = min(nr_pull, nr_batch);
+ }
+
+ if (nr_pull < 1) {
+ spin_unlock_irqrestore_cond(&dtags->lock, dtags->cpu, flags);
+ spin_unlock_irqrestore_cond(&stags->lock, stags->cpu, flags);
+ return -ENOSPC;
+ }
+
+ move_tags(dtags->freelist, &dtags->nr_free,
+ stags->freelist, &stags->nr_free, nr_pull);
+
+ dtags->nr_alloc++;
+ dtags->nr_free--;
+
+ tag = dtags->freelist[dtags->nr_free];
+
+ if (dtags->nr_free)
+ cpumask_set_cpu(dtags->cpu, &pt->free_tags);
+
+ spin_unlock_irqrestore_cond(&dtags->lock, dtags->cpu, flags);
+
+ if (!stags->nr_free)
+ cpumask_clear_cpu(stags->cpu, &pt->free_tags);
+
+ spin_unlock_irqrestore_cond(&stags->lock, stags->cpu, flags);
+
+ cpu_to_tag(pt, dtags->cpu, tag);
+
+ return tag;
+}
+
+static wait_queue_head_t *
+prepare_to_wait_tag(struct percpu_tags *pt, wait_queue_t *wait, int state)
+{
+ struct percpu_cache *tags;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ tags = this_cpu_ptr(pt->cache);
+ spin_lock(&tags->lock);
+
+ prepare_to_wait(&tags->wait, wait, state);
+ cpumask_set_cpu(tags->cpu, &pt->wait_tags);
+
+ spin_unlock(&tags->lock);
+ local_irq_restore(flags);
+
+ return &tags->wait;
+}
+
+static int
+__alloc_tag_global(struct percpu_tags *pt,
+ int start_cpu, const cpumask_t *cpumask, bool force)
+{
+ struct percpu_cache *this_tags = per_cpu_ptr(pt->cache, start_cpu);
+ struct percpu_cache *remote_tags;
+ int cpu = start_cpu;
+ int n;
+ int tag = -ENOSPC;
+
+ for (n = cpumask_weight(cpumask); n; n--) {
+ cpu = cpumask_next(cpu, cpumask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(cpumask);
+ if (cpu >= nr_cpu_ids)
+ break;
+
+ if (cpu == start_cpu)
+ continue;
+
+ remote_tags = per_cpu_ptr(pt->cache, cpu);
+ tag = pull_tag(pt, this_tags, remote_tags, force);
+ if (tag >= 0)
+ break;
+ }
+
+ return tag;
+}
+
+static int alloc_tag_global(struct percpu_tags *pt)
+{
+ int this_cpu = smp_processor_id();
+ int tag;
+
+ tag = __alloc_tag_global(pt, this_cpu, &pt->free_tags, false);
+ if (tag < 0)
+ tag = __alloc_tag_global(pt, this_cpu, &pt->free_tags, true);
+
+ return tag;
+}
+
+int percpu_tags_alloc(struct percpu_tags *pt, int state)
+{
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wq = NULL;
+ int tag = -ENOSPC;
+
+ for (;;) {
+ tag = alloc_tag_local(pt);
+ if (tag >= 0)
+ break;
+
+ tag = alloc_tag_global(pt);
+ if (tag >= 0)
+ break;
+
+ if (state == TASK_RUNNING)
+ break;
+
+ wq = prepare_to_wait_tag(pt, &wait, state);
+
+ schedule();
+
+ if (signal_pending_state(state, current)) {
+ tag = -ERESTARTSYS;
+ break;
+ }
+ }
+
+ if (wq)
+ finish_wait(wq, &wait);
+
+ return tag;
+}
+EXPORT_SYMBOL_GPL(percpu_tags_alloc);
+
+static bool __free_tag(struct percpu_tags *pt,
+ struct percpu_cache *tags, int tag, bool force)
+{
+ int nr_cache = cache_size(pt);
+ int cpu = tags->cpu;
+ unsigned long uninitialized_var(flags);
+ bool ret;
+
+ spin_lock_irqsave_cond(&tags->lock, cpu, flags);
+
+ BUG_ON(tags->nr_free >= pt->nr_tags);
+ if (force || (tags->nr_free + tags->nr_alloc < nr_cache)) {
+ tags->freelist[tags->nr_free] = tag;
+ tags->nr_free++;
+ cpumask_set_cpu(cpu, &pt->free_tags);
+ ret = true;
+ } else {
+ ret = false;
+ }
+
+ spin_unlock_irqrestore_cond(&tags->lock, cpu, flags);
+
+ return ret;
+}
+
+static bool free_tag(struct percpu_tags *pt, struct percpu_cache *tags, int tag)
+{
+ return __free_tag(pt, tags, tag, false);
+}
+
+static bool push_tag(struct percpu_tags *pt, struct percpu_cache *tags, int tag)
+{
+ return __free_tag(pt, tags, tag, true);
+}
+
+static void dealloc_tag(struct percpu_tags *pt, int cpu, int tag)
+{
+ struct percpu_cache *tags = per_cpu_ptr(pt->cache, cpu);
+ unsigned long uninitialized_var(flags);
+
+ spin_lock_irqsave_cond(&tags->lock, cpu, flags);
+
+ BUG_ON(tags->nr_alloc < 1);
+ tags->nr_alloc--;
+ if (!tags->nr_alloc)
+ cpumask_clear_cpu(tags->cpu, &pt->alloc_tags);
+
+ spin_unlock_irqrestore_cond(&tags->lock, cpu, flags);
+}
+
+static int free_tag_scarce(struct percpu_tags *pt, int start_cpu, int tag)
+{
+ struct percpu_cache *tags;
+ int cpu;
+
+ cpu = cpumask_next(start_cpu, &pt->wait_tags);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(&pt->wait_tags);
+
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_next(start_cpu, &pt->alloc_tags);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(&pt->alloc_tags);
+
+ if (cpu >= nr_cpu_ids)
+ cpu = start_cpu;
+
+ tags = per_cpu_ptr(pt->cache, cpu);
+ push_tag(pt, tags, tag);
+
+ return cpu;
+}
+
+static int __free_tag_normal(struct percpu_tags *pt,
+ int start_cpu, int tag, const cpumask_t *cpumask)
+{
+ struct percpu_cache *tags;
+ int cpu;
+ int n;
+
+ tags = per_cpu_ptr(pt->cache, start_cpu);
+ if (free_tag(pt, tags, tag))
+ return start_cpu;
+
+ cpu = nr_cpu_ids;
+
+ for (n = cpumask_weight(cpumask); n; n--) {
+ cpu = cpumask_next(cpu, cpumask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(cpumask);
+ if (cpu >= nr_cpu_ids)
+ break;
+
+ if (cpu == start_cpu)
+ continue;
+
+ tags = per_cpu_ptr(pt->cache, cpu);
+ if (free_tag(pt, tags, tag))
+ break;
+ }
+
+ return cpu;
+}
+
+static int free_tag_normal(struct percpu_tags *pt, int start_cpu, int tag)
+{
+ struct percpu_cache *tags;
+ int cpu;
+
+ cpu = __free_tag_normal(pt, start_cpu, tag, &pt->alloc_tags);
+
+ if (cpu >= nr_cpu_ids)
+ cpu = __free_tag_normal(pt, start_cpu, tag, &pt->wait_tags);
+
+ if (cpu >= nr_cpu_ids) {
+ cpu = start_cpu;
+ tags = per_cpu_ptr(pt->cache, cpu);
+ push_tag(pt, tags, tag);
+ }
+
+ return cpu;
+}
+
+static bool wake_on_cpu(struct percpu_tags *pt, int cpu)
+{
+ struct percpu_cache *tags = per_cpu_ptr(pt->cache, cpu);
+ int wq_active;
+ unsigned long uninitialized_var(flags);
+
+ spin_lock_irqsave_cond(&tags->lock, cpu, flags);
+ wq_active = waitqueue_active(&tags->wait);
+ if (!wq_active)
+ cpumask_clear_cpu(cpu, &pt->wait_tags);
+ spin_unlock_irqrestore_cond(&tags->lock, cpu, flags);
+
+ if (wq_active)
+ wake_up(&tags->wait);
+
+ return wq_active;
+}
+
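+/*
+ * Return a tag, preferably to the cache of the CPU it was allocated
+ * on, and wake up a waiter if there is one.
+ */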
+void percpu_tags_free(struct percpu_tags *pt, int tag)
+{
+ bool scarce = pt->nr_tags < cpumask_weight(cpu_online_mask);
+ int cpu, alloc_cpu;
+ int n;
+
+ alloc_cpu = cpu_from_tag(pt, tag);
+ dealloc_tag(pt, alloc_cpu, tag);
+
+ if (scarce)
+ cpu = free_tag_scarce(pt, alloc_cpu, tag);
+ else
+ cpu = free_tag_normal(pt, alloc_cpu, tag);
+
+ if (wake_on_cpu(pt, cpu))
+ return;
+
+ for (n = cpumask_weight(&pt->wait_tags); n; n--) {
+ cpu = cpumask_next(cpu, &pt->wait_tags);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(&pt->wait_tags);
+ if (cpu >= nr_cpu_ids)
+ break;
+
+ if (wake_on_cpu(pt, cpu))
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(percpu_tags_free);
+
+int percpu_tags_init(struct percpu_tags *pt, int nr_tags)
+{
+ struct percpu_cache *tags;
+ int order;
+ int cpu;
+ int i;
+
+ order = get_order(nr_tags * sizeof(pt->tag_cpu_map[0]));
+ pt->tag_cpu_map = (int *)__get_free_pages(GFP_KERNEL, order);
+ if (!pt->tag_cpu_map)
+ return -ENOMEM;
+
+ pt->cache = __alloc_percpu(sizeof(*pt->cache) +
+ nr_tags * sizeof(pt->cache->freelist[0]),
+ sizeof(pt->cache->freelist[0]));
+ if (!pt->cache) {
+ free_pages((unsigned long)pt->tag_cpu_map, order);
+ return -ENOMEM;
+ }
+
+ pt->nr_tags = nr_tags;
+
+ for_each_possible_cpu(cpu) {
+ tags = per_cpu_ptr(pt->cache, cpu);
+ tags->cpu = cpu;
+ spin_lock_init(&tags->lock);
+ init_waitqueue_head(&tags->wait);
+ }
+
+ tags = this_cpu_ptr(pt->cache);
+ for (i = 0; i < nr_tags; i++)
+ tags->freelist[i] = i;
+ tags->nr_free = nr_tags;
+ cpumask_set_cpu(tags->cpu, &pt->free_tags);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(percpu_tags_init);
+
+void percpu_tags_destroy(struct percpu_tags *pt)
+{
+ int order = get_order(pt->nr_tags * sizeof(pt->tag_cpu_map[0]));
+ free_pages((unsigned long)pt->tag_cpu_map, order);
+ free_percpu(pt->cache);
+}
+EXPORT_SYMBOL_GPL(percpu_tags_destroy);
+
+int percpu_tags_for_each_free(struct percpu_tags *pt,
+ int (*fn)(unsigned, void *), void *data)
+{
+ unsigned long flags;
+ struct percpu_cache *remote;
+ unsigned cpu, i;
+ int err = 0;
+
+ local_irq_save(flags);
+ for_each_possible_cpu(cpu) {
+ remote = per_cpu_ptr(pt->cache, cpu);
+ spin_lock(&remote->lock);
+ for (i = 0; i < remote->nr_free; i++) {
+ err = fn(remote->freelist[i], data);
+ if (err)
+ break;
+ }
+ spin_unlock(&remote->lock);
+ if (err)
+ goto out;
+ }
+
+out:
+ local_irq_restore(flags);
+ return err;
+}
+EXPORT_SYMBOL_GPL(percpu_tags_for_each_free);
+
+int percpu_tags_free_tags(struct percpu_tags *pt, int cpu)
+{
+ struct percpu_cache *remote;
+ if (cpu >= nr_cpu_ids)
+ return 0;
+ remote = per_cpu_ptr(pt->cache, cpu);
+ return remote->nr_free;
+}
+EXPORT_SYMBOL_GPL(percpu_tags_free_tags);
--
1.7.7.6