Hi Bruce!
Here are the patches that I had mentioned earlier that reduce the
contention for the pool->sp_lock when the server is heavily loaded.
The basic problem is that whenever a svc_xprt needs to be queued up for
servicing, we have to take the pool->sp_lock to try and find an idle
thread to service it. On a busy server, that lock becomes highly
contended and that limits the throughput.
This patchset fixes this by changing how we search for an idle thread.
First, we convert svc_rqst and the sp_all_threads list to be
RCU-managed. Then we change the search for an idle thread to use the
sp_all_threads list, which now can be done under the rcu_read_lock.
When there is an available thread, queueing an xprt to it can now be
done without any spinlocking.
With this, we see a pretty substantial increase in performance on a
larger-scale server that is heavily loaded. Chris has some preliminary
numbers, but they need to be cleaned up a bit before we can present
them. I'm hoping to have those by early next week.
Jeff Layton (4):
sunrpc: add a rcu_head to svc_rqst and use kfree_rcu to free it
sunrpc: fix potential races in pool_stats collection
sunrpc: convert to lockless lookup of queued server threads
sunrpc: add some tracepoints around enqueue and dequeue of svc_xprt
include/linux/sunrpc/svc.h | 12 +-
include/trace/events/sunrpc.h | 98 +++++++++++++++-
net/sunrpc/svc.c | 17 +--
net/sunrpc/svc_xprt.c | 252 ++++++++++++++++++++++++------------------
4 files changed, 258 insertions(+), 121 deletions(-)
--
2.1.0
...also make the manipulation of sp_all_threads list use RCU-friendly
functions.
Signed-off-by: Jeff Layton <[email protected]>
Tested-by: Chris Worley <[email protected]>
---
include/linux/sunrpc/svc.h | 2 ++
include/trace/events/sunrpc.h | 3 ++-
net/sunrpc/svc.c | 10 ++++++----
3 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 5f0ab39bf7c3..7f80a99c59e4 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -223,6 +223,7 @@ static inline void svc_putu32(struct kvec *iov, __be32 val)
struct svc_rqst {
struct list_head rq_list; /* idle list */
struct list_head rq_all; /* all threads list */
+ struct rcu_head rq_rcu_head; /* for RCU deferred kfree */
struct svc_xprt * rq_xprt; /* transport ptr */
struct sockaddr_storage rq_addr; /* peer address */
@@ -262,6 +263,7 @@ struct svc_rqst {
#define RQ_SPLICE_OK (4) /* turned off in gss privacy
* to prevent encrypting page
* cache pages */
+#define RQ_VICTIM (5) /* about to be shut down */
unsigned long rq_flags; /* flags field */
void * rq_argp; /* decoded arguments */
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 5848fc235869..08a5fed50f34 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -418,7 +418,8 @@ TRACE_EVENT(xs_tcp_data_recv,
{ (1UL << RQ_LOCAL), "RQ_LOCAL"}, \
{ (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \
{ (1UL << RQ_DROPME), "RQ_DROPME"}, \
- { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"})
+ { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}, \
+ { (1UL << RQ_VICTIM), "RQ_VICTIM"})
TRACE_EVENT(svc_recv,
TP_PROTO(struct svc_rqst *rqst, int status),
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 5d9a443d21f6..4edef32f3b9f 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -616,7 +616,7 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
serv->sv_nrthreads++;
spin_lock_bh(&pool->sp_lock);
pool->sp_nrthreads++;
- list_add(&rqstp->rq_all, &pool->sp_all_threads);
+ list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
spin_unlock_bh(&pool->sp_lock);
rqstp->rq_server = serv;
rqstp->rq_pool = pool;
@@ -684,7 +684,8 @@ found_pool:
* so we don't try to kill it again.
*/
rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all);
- list_del_init(&rqstp->rq_all);
+ set_bit(RQ_VICTIM, &rqstp->rq_flags);
+ list_del_rcu(&rqstp->rq_all);
task = rqstp->rq_task;
}
spin_unlock_bh(&pool->sp_lock);
@@ -782,10 +783,11 @@ svc_exit_thread(struct svc_rqst *rqstp)
spin_lock_bh(&pool->sp_lock);
pool->sp_nrthreads--;
- list_del(&rqstp->rq_all);
+ if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags))
+ list_del_rcu(&rqstp->rq_all);
spin_unlock_bh(&pool->sp_lock);
- kfree(rqstp);
+ kfree_rcu(rqstp, rq_rcu_head);
/* Release the server */
if (serv)
--
2.1.0
In a later patch, we'll be removing some spinlocking around the socket
and thread queueing code in order to fix some contention problems. At
that point, the stats counters will no longer be protected by the
sp_lock.
Change the counters to atomic_long_t fields, except for the
"sockets_queued" counter which will still be manipulated under a
spinlock.
Signed-off-by: Jeff Layton <[email protected]>
Tested-by: Chris Worley <[email protected]>
---
include/linux/sunrpc/svc.h | 6 +++---
net/sunrpc/svc_xprt.c | 12 ++++++------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 7f80a99c59e4..513957eba0a5 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -26,10 +26,10 @@ typedef int (*svc_thread_fn)(void *);
/* statistics for svc_pool structures */
struct svc_pool_stats {
- unsigned long packets;
+ atomic_long_t packets;
unsigned long sockets_queued;
- unsigned long threads_woken;
- unsigned long threads_timedout;
+ atomic_long_t threads_woken;
+ atomic_long_t threads_timedout;
};
/*
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b2676e597fc4..579ff2249562 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -362,7 +362,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
spin_lock_bh(&pool->sp_lock);
- pool->sp_stats.packets++;
+ atomic_long_inc(&pool->sp_stats.packets);
if (!list_empty(&pool->sp_threads)) {
rqstp = list_entry(pool->sp_threads.next,
@@ -383,7 +383,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
svc_xprt_get(xprt);
wake_up_process(rqstp->rq_task);
rqstp->rq_xprt = xprt;
- pool->sp_stats.threads_woken++;
+ atomic_long_inc(&pool->sp_stats.threads_woken);
} else {
dprintk("svc: transport %p put into queue\n", xprt);
list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
@@ -669,7 +669,7 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
spin_lock_bh(&pool->sp_lock);
if (!time_left)
- pool->sp_stats.threads_timedout++;
+ atomic_long_inc(&pool->sp_stats.threads_timedout);
xprt = rqstp->rq_xprt;
if (!xprt) {
@@ -1306,10 +1306,10 @@ static int svc_pool_stats_show(struct seq_file *m, void *p)
seq_printf(m, "%u %lu %lu %lu %lu\n",
pool->sp_id,
- pool->sp_stats.packets,
+ (unsigned long)atomic_long_read(&pool->sp_stats.packets),
pool->sp_stats.sockets_queued,
- pool->sp_stats.threads_woken,
- pool->sp_stats.threads_timedout);
+ (unsigned long)atomic_long_read(&pool->sp_stats.threads_woken),
+ (unsigned long)atomic_long_read(&pool->sp_stats.threads_timedout));
return 0;
}
--
2.1.0
Testing has shown that the pool->sp_lock can be a bottleneck on a busy
server. Every time data is received on a socket, the server must take
that lock in order to dequeue a thread from the sp_threads list.
Address this problem by eliminating the sp_threads list (which contains
threads that are currently idle) and replacing it with a RQ_BUSY flag in
svc_rqst. This allows us to walk the sp_all_threads list under the
rcu_read_lock and find a suitable thread for the xprt by doing a
test_and_set_bit.
Note that we do still have a potential atomicity problem however with
this approach. We don't want svc_xprt_do_enqueue to set the
rqst->rq_xprt pointer unless a test_and_set_bit of RQ_BUSY returned
negative (which indicates that the thread was idle). But, by the time we
check that, the big could be flipped by a waking thread.
To address this, we acquire a new per-rqst spinlock (rq_lock) and take
that before doing the test_and_set_bit. If that returns false, then we
can set rq_xprt and drop the spinlock. Then, when the thread wakes up,
it must set the bit under the same spinlock and can trust that if it was
already set then the rq_xprt is also properly set.
With this scheme, the case where we have an idle thread no longer needs
to take the highly contended pool->sp_lock at all, and that removes the
bottleneck.
That still leaves one issue: What of the case where we walk the whole
sp_all_threads list and don't find an idle thread? Because the search is
lockess, it's possible for the queueing to race with a thread that is
going to sleep. To address that, we queue the xprt and then search again.
If we find an idle thread at that point, we can't attach the xprt to it
directly since that might race with a different thread waking up and
finding it. All we can do is wake the idle thread back up and let it
attempt to find the now-queued xprt.
Signed-off-by: Jeff Layton <[email protected]>
Tested-by: Chris Worley <[email protected]>
---
include/linux/sunrpc/svc.h | 4 +-
include/trace/events/sunrpc.h | 3 +-
net/sunrpc/svc.c | 7 +-
net/sunrpc/svc_xprt.c | 221 ++++++++++++++++++++++++------------------
4 files changed, 132 insertions(+), 103 deletions(-)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 513957eba0a5..6f22cfeef5e3 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -45,7 +45,6 @@ struct svc_pool_stats {
struct svc_pool {
unsigned int sp_id; /* pool id; also node id on NUMA */
spinlock_t sp_lock; /* protects all fields */
- struct list_head sp_threads; /* idle server threads */
struct list_head sp_sockets; /* pending sockets */
unsigned int sp_nrthreads; /* # of threads in pool */
struct list_head sp_all_threads; /* all server threads */
@@ -221,7 +220,6 @@ static inline void svc_putu32(struct kvec *iov, __be32 val)
* processed.
*/
struct svc_rqst {
- struct list_head rq_list; /* idle list */
struct list_head rq_all; /* all threads list */
struct rcu_head rq_rcu_head; /* for RCU deferred kfree */
struct svc_xprt * rq_xprt; /* transport ptr */
@@ -264,6 +262,7 @@ struct svc_rqst {
* to prevent encrypting page
* cache pages */
#define RQ_VICTIM (5) /* about to be shut down */
+#define RQ_BUSY (6) /* request is busy */
unsigned long rq_flags; /* flags field */
void * rq_argp; /* decoded arguments */
@@ -285,6 +284,7 @@ struct svc_rqst {
struct auth_domain * rq_gssclient; /* "gss/"-style peer info */
struct svc_cacherep * rq_cacherep; /* cache info */
struct task_struct *rq_task; /* service thread */
+ spinlock_t rq_lock; /* per-request lock */
};
#define SVC_NET(svc_rqst) (svc_rqst->rq_xprt->xpt_net)
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 08a5fed50f34..ee4438a63a48 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -419,7 +419,8 @@ TRACE_EVENT(xs_tcp_data_recv,
{ (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \
{ (1UL << RQ_DROPME), "RQ_DROPME"}, \
{ (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}, \
- { (1UL << RQ_VICTIM), "RQ_VICTIM"})
+ { (1UL << RQ_VICTIM), "RQ_VICTIM"}, \
+ { (1UL << RQ_BUSY), "RQ_BUSY"})
TRACE_EVENT(svc_recv,
TP_PROTO(struct svc_rqst *rqst, int status),
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4edef32f3b9f..4308881d9d0a 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -476,7 +476,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
i, serv->sv_name);
pool->sp_id = i;
- INIT_LIST_HEAD(&pool->sp_threads);
INIT_LIST_HEAD(&pool->sp_sockets);
INIT_LIST_HEAD(&pool->sp_all_threads);
spin_lock_init(&pool->sp_lock);
@@ -614,12 +613,14 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
goto out_enomem;
serv->sv_nrthreads++;
+ __set_bit(RQ_BUSY, &rqstp->rq_flags);
+ spin_lock_init(&rqstp->rq_lock);
+ rqstp->rq_server = serv;
+ rqstp->rq_pool = pool;
spin_lock_bh(&pool->sp_lock);
pool->sp_nrthreads++;
list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
spin_unlock_bh(&pool->sp_lock);
- rqstp->rq_server = serv;
- rqstp->rq_pool = pool;
rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
if (!rqstp->rq_argp)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 579ff2249562..ed90d955f733 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -310,25 +310,6 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
}
EXPORT_SYMBOL_GPL(svc_print_addr);
-/*
- * Queue up an idle server thread. Must have pool->sp_lock held.
- * Note: this is really a stack rather than a queue, so that we only
- * use as many different threads as we need, and the rest don't pollute
- * the cache.
- */
-static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
-{
- list_add(&rqstp->rq_list, &pool->sp_threads);
-}
-
-/*
- * Dequeue an nfsd thread. Must have pool->sp_lock held.
- */
-static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
-{
- list_del(&rqstp->rq_list);
-}
-
static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
{
if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
@@ -343,6 +324,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
struct svc_pool *pool;
struct svc_rqst *rqstp;
int cpu;
+ bool queued = false;
if (!svc_xprt_has_something_to_do(xprt))
return;
@@ -360,37 +342,60 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
cpu = get_cpu();
pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
- spin_lock_bh(&pool->sp_lock);
atomic_long_inc(&pool->sp_stats.packets);
- if (!list_empty(&pool->sp_threads)) {
- rqstp = list_entry(pool->sp_threads.next,
- struct svc_rqst,
- rq_list);
- dprintk("svc: transport %p served by daemon %p\n",
- xprt, rqstp);
- svc_thread_dequeue(pool, rqstp);
- if (rqstp->rq_xprt)
- printk(KERN_ERR
- "svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
- rqstp, rqstp->rq_xprt);
- /* Note the order of the following 3 lines:
- * We want to assign xprt to rqstp->rq_xprt only _after_
- * we've woken up the process, so that we don't race with
- * the lockless check in svc_get_next_xprt().
+redo_search:
+ /* find a thread for this xprt */
+ rcu_read_lock();
+ list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
+ /* Do a lockless check first */
+ if (test_bit(RQ_BUSY, &rqstp->rq_flags))
+ continue;
+
+ /*
+ * Once the xprt has been queued, it can only be dequeued by
+ * the task that intends to service it. All we can do at that
+ * point is to try to wake this thread back up so that it can
+ * do so.
*/
- svc_xprt_get(xprt);
- wake_up_process(rqstp->rq_task);
- rqstp->rq_xprt = xprt;
+ if (!queued) {
+ spin_lock_bh(&rqstp->rq_lock);
+ if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) {
+ /* already busy, move on... */
+ spin_unlock_bh(&rqstp->rq_lock);
+ continue;
+ }
+
+ /* this one will do */
+ rqstp->rq_xprt = xprt;
+ svc_xprt_get(xprt);
+ spin_unlock_bh(&rqstp->rq_lock);
+ }
+ rcu_read_unlock();
+
atomic_long_inc(&pool->sp_stats.threads_woken);
- } else {
+ wake_up_process(rqstp->rq_task);
+ put_cpu();
+ return;
+ }
+ rcu_read_unlock();
+
+ /*
+ * We didn't find an idle thread to use, so we need to queue the xprt.
+ * Do so and then search again. If we find one, we can't hook this one
+ * up to it directly but we can wake the thread up in the hopes that it
+ * will pick it up once it searches for a xprt to service.
+ */
+ if (!queued) {
+ queued = true;
dprintk("svc: transport %p put into queue\n", xprt);
+ spin_lock_bh(&pool->sp_lock);
list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
pool->sp_stats.sockets_queued++;
+ spin_unlock_bh(&pool->sp_lock);
+ goto redo_search;
}
-
- spin_unlock_bh(&pool->sp_lock);
put_cpu();
}
@@ -408,21 +413,26 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
/*
- * Dequeue the first transport. Must be called with the pool->sp_lock held.
+ * Dequeue the first transport, if there is one.
*/
static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
{
- struct svc_xprt *xprt;
+ struct svc_xprt *xprt = NULL;
if (list_empty(&pool->sp_sockets))
return NULL;
- xprt = list_entry(pool->sp_sockets.next,
- struct svc_xprt, xpt_ready);
- list_del_init(&xprt->xpt_ready);
+ spin_lock_bh(&pool->sp_lock);
+ if (likely(!list_empty(&pool->sp_sockets))) {
+ xprt = list_first_entry(&pool->sp_sockets,
+ struct svc_xprt, xpt_ready);
+ list_del_init(&xprt->xpt_ready);
+ svc_xprt_get(xprt);
- dprintk("svc: transport %p dequeued, inuse=%d\n",
- xprt, atomic_read(&xprt->xpt_ref.refcount));
+ dprintk("svc: transport %p dequeued, inuse=%d\n",
+ xprt, atomic_read(&xprt->xpt_ref.refcount));
+ }
+ spin_unlock_bh(&pool->sp_lock);
return xprt;
}
@@ -497,16 +507,21 @@ void svc_wake_up(struct svc_serv *serv)
pool = &serv->sv_pools[0];
- spin_lock_bh(&pool->sp_lock);
- if (!list_empty(&pool->sp_threads)) {
- rqstp = list_entry(pool->sp_threads.next,
- struct svc_rqst,
- rq_list);
+ rcu_read_lock();
+ list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
+ /* skip any that aren't queued */
+ if (test_bit(RQ_BUSY, &rqstp->rq_flags))
+ continue;
+ rcu_read_unlock();
dprintk("svc: daemon %p woken up.\n", rqstp);
wake_up_process(rqstp->rq_task);
- } else
- set_bit(SP_TASK_PENDING, &pool->sp_flags);
- spin_unlock_bh(&pool->sp_lock);
+ return;
+ }
+ rcu_read_unlock();
+
+ /* No free entries available */
+ set_bit(SP_TASK_PENDING, &pool->sp_flags);
+ smp_wmb();
}
EXPORT_SYMBOL_GPL(svc_wake_up);
@@ -617,22 +632,47 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
return 0;
}
+static bool
+rqst_should_sleep(struct svc_rqst *rqstp)
+{
+ struct svc_pool *pool = rqstp->rq_pool;
+
+ /* did someone call svc_wake_up? */
+ if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags))
+ return false;
+
+ /* was a socket queued? */
+ if (!list_empty(&pool->sp_sockets))
+ return false;
+
+ /* are we shutting down? */
+ if (signalled() || kthread_should_stop())
+ return false;
+
+ /* are we freezing? */
+ if (freezing(current))
+ return false;
+
+ return true;
+}
+
static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
{
struct svc_xprt *xprt;
struct svc_pool *pool = rqstp->rq_pool;
long time_left = 0;
+ /* rq_xprt should be clear on entry */
+ WARN_ON_ONCE(rqstp->rq_xprt);
+
/* Normally we will wait up to 5 seconds for any required
* cache information to be provided.
*/
rqstp->rq_chandle.thread_wait = 5*HZ;
- spin_lock_bh(&pool->sp_lock);
xprt = svc_xprt_dequeue(pool);
if (xprt) {
rqstp->rq_xprt = xprt;
- svc_xprt_get(xprt);
/* As there is a shortage of threads and this request
* had to be queued, don't allow the thread to wait so
@@ -640,51 +680,38 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
*/
rqstp->rq_chandle.thread_wait = 1*HZ;
clear_bit(SP_TASK_PENDING, &pool->sp_flags);
- } else {
- if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags)) {
- xprt = ERR_PTR(-EAGAIN);
- goto out;
- }
- /*
- * We have to be able to interrupt this wait
- * to bring down the daemons ...
- */
- set_current_state(TASK_INTERRUPTIBLE);
+ return xprt;
+ }
- /* No data pending. Go to sleep */
- svc_thread_enqueue(pool, rqstp);
- spin_unlock_bh(&pool->sp_lock);
+ /*
+ * We have to be able to interrupt this wait
+ * to bring down the daemons ...
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ clear_bit(RQ_BUSY, &rqstp->rq_flags);
+ smp_mb();
+
+ if (likely(rqst_should_sleep(rqstp)))
+ time_left = schedule_timeout(timeout);
+ else
+ __set_current_state(TASK_RUNNING);
- if (!(signalled() || kthread_should_stop())) {
- time_left = schedule_timeout(timeout);
- __set_current_state(TASK_RUNNING);
+ try_to_freeze();
- try_to_freeze();
+ spin_lock_bh(&rqstp->rq_lock);
+ set_bit(RQ_BUSY, &rqstp->rq_flags);
+ spin_unlock_bh(&rqstp->rq_lock);
- xprt = rqstp->rq_xprt;
- if (xprt != NULL)
- return xprt;
- } else
- __set_current_state(TASK_RUNNING);
+ xprt = rqstp->rq_xprt;
+ if (xprt != NULL)
+ return xprt;
- spin_lock_bh(&pool->sp_lock);
- if (!time_left)
- atomic_long_inc(&pool->sp_stats.threads_timedout);
+ if (!time_left)
+ atomic_long_inc(&pool->sp_stats.threads_timedout);
- xprt = rqstp->rq_xprt;
- if (!xprt) {
- svc_thread_dequeue(pool, rqstp);
- spin_unlock_bh(&pool->sp_lock);
- dprintk("svc: server %p, no data yet\n", rqstp);
- if (signalled() || kthread_should_stop())
- return ERR_PTR(-EINTR);
- else
- return ERR_PTR(-EAGAIN);
- }
- }
-out:
- spin_unlock_bh(&pool->sp_lock);
- return xprt;
+ if (signalled() || kthread_should_stop())
+ return ERR_PTR(-EINTR);
+ return ERR_PTR(-EAGAIN);
}
static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
--
2.1.0
These were useful when I was tracking down a race condition between
svc_xprt_do_enqueue and svc_get_next_xprt.
Signed-off-by: Jeff Layton <[email protected]>
---
include/trace/events/sunrpc.h | 94 +++++++++++++++++++++++++++++++++++++++++++
net/sunrpc/svc_xprt.c | 23 +++++++----
2 files changed, 110 insertions(+), 7 deletions(-)
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index ee4438a63a48..b9c1dc6c825a 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -8,6 +8,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <net/tcp_states.h>
#include <linux/net.h>
#include <linux/tracepoint.h>
@@ -480,6 +481,99 @@ DEFINE_EVENT(svc_rqst_status, svc_send,
TP_PROTO(struct svc_rqst *rqst, int status),
TP_ARGS(rqst, status));
+#define show_svc_xprt_flags(flags) \
+ __print_flags(flags, "|", \
+ { (1UL << XPT_BUSY), "XPT_BUSY"}, \
+ { (1UL << XPT_CONN), "XPT_CONN"}, \
+ { (1UL << XPT_CLOSE), "XPT_CLOSE"}, \
+ { (1UL << XPT_DATA), "XPT_DATA"}, \
+ { (1UL << XPT_TEMP), "XPT_TEMP"}, \
+ { (1UL << XPT_DEAD), "XPT_DEAD"}, \
+ { (1UL << XPT_CHNGBUF), "XPT_CHNGBUF"}, \
+ { (1UL << XPT_DEFERRED), "XPT_DEFERRED"}, \
+ { (1UL << XPT_OLD), "XPT_OLD"}, \
+ { (1UL << XPT_LISTENER), "XPT_LISTENER"}, \
+ { (1UL << XPT_CACHE_AUTH), "XPT_CACHE_AUTH"}, \
+ { (1UL << XPT_LOCAL), "XPT_LOCAL"})
+
+TRACE_EVENT(svc_xprt_do_enqueue,
+ TP_PROTO(struct svc_xprt *xprt, struct svc_rqst *rqst),
+
+ TP_ARGS(xprt, rqst),
+
+ TP_STRUCT__entry(
+ __field(struct svc_xprt *, xprt)
+ __field(struct svc_rqst *, rqst)
+ ),
+
+ TP_fast_assign(
+ __entry->xprt = xprt;
+ __entry->rqst = rqst;
+ ),
+
+ TP_printk("xprt=0x%p addr=%pIScp pid=%d flags=%s", __entry->xprt,
+ (struct sockaddr *)&__entry->xprt->xpt_remote,
+ __entry->rqst ? __entry->rqst->rq_task->pid : 0,
+ show_svc_xprt_flags(__entry->xprt->xpt_flags))
+);
+
+TRACE_EVENT(svc_xprt_dequeue,
+ TP_PROTO(struct svc_xprt *xprt),
+
+ TP_ARGS(xprt),
+
+ TP_STRUCT__entry(
+ __field(struct svc_xprt *, xprt)
+ __field_struct(struct sockaddr_storage, ss)
+ __field(unsigned long, flags)
+ ),
+
+ TP_fast_assign(
+ __entry->xprt = xprt,
+ xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss));
+ __entry->flags = xprt ? xprt->xpt_flags : 0;
+ ),
+
+ TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt,
+ (struct sockaddr *)&__entry->ss,
+ show_svc_xprt_flags(__entry->flags))
+);
+
+TRACE_EVENT(svc_wake_up,
+ TP_PROTO(int pid),
+
+ TP_ARGS(pid),
+
+ TP_STRUCT__entry(
+ __field(int, pid)
+ ),
+
+ TP_fast_assign(
+ __entry->pid = pid;
+ ),
+
+ TP_printk("pid=%d", __entry->pid)
+);
+
+TRACE_EVENT(svc_handle_xprt,
+ TP_PROTO(struct svc_xprt *xprt, int len),
+
+ TP_ARGS(xprt, len),
+
+ TP_STRUCT__entry(
+ __field(struct svc_xprt *, xprt)
+ __field(int, len)
+ ),
+
+ TP_fast_assign(
+ __entry->xprt = xprt;
+ __entry->len = len;
+ ),
+
+ TP_printk("xprt=0x%p addr=%pIScp len=%d flags=%s", __entry->xprt,
+ (struct sockaddr *)&__entry->xprt->xpt_remote, __entry->len,
+ show_svc_xprt_flags(__entry->xprt->xpt_flags))
+);
#endif /* _TRACE_SUNRPC_H */
#include <trace/define_trace.h>
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index ed90d955f733..0ae1d78d934d 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -322,12 +322,12 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
{
struct svc_pool *pool;
- struct svc_rqst *rqstp;
+ struct svc_rqst *rqstp = NULL;
int cpu;
bool queued = false;
if (!svc_xprt_has_something_to_do(xprt))
- return;
+ goto out;
/* Mark transport as busy. It will remain in this state until
* the provider calls svc_xprt_received. We update XPT_BUSY
@@ -337,7 +337,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
/* Don't enqueue transport while already enqueued */
dprintk("svc: transport %p busy, not enqueued\n", xprt);
- return;
+ goto out;
}
cpu = get_cpu();
@@ -377,7 +377,7 @@ redo_search:
atomic_long_inc(&pool->sp_stats.threads_woken);
wake_up_process(rqstp->rq_task);
put_cpu();
- return;
+ goto out;
}
rcu_read_unlock();
@@ -387,6 +387,7 @@ redo_search:
* up to it directly but we can wake the thread up in the hopes that it
* will pick it up once it searches for a xprt to service.
*/
+ dprintk("svc: transport %p put into queue\n", xprt);
if (!queued) {
queued = true;
dprintk("svc: transport %p put into queue\n", xprt);
@@ -396,7 +397,10 @@ redo_search:
spin_unlock_bh(&pool->sp_lock);
goto redo_search;
}
+ rqstp = NULL;
put_cpu();
+out:
+ trace_svc_xprt_do_enqueue(xprt, rqstp);
}
/*
@@ -420,7 +424,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
struct svc_xprt *xprt = NULL;
if (list_empty(&pool->sp_sockets))
- return NULL;
+ goto out;
spin_lock_bh(&pool->sp_lock);
if (likely(!list_empty(&pool->sp_sockets))) {
@@ -433,7 +437,8 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
xprt, atomic_read(&xprt->xpt_ref.refcount));
}
spin_unlock_bh(&pool->sp_lock);
-
+out:
+ trace_svc_xprt_dequeue(xprt);
return xprt;
}
@@ -515,6 +520,7 @@ void svc_wake_up(struct svc_serv *serv)
rcu_read_unlock();
dprintk("svc: daemon %p woken up.\n", rqstp);
wake_up_process(rqstp->rq_task);
+ trace_svc_wake_up(rqstp->rq_task->pid);
return;
}
rcu_read_unlock();
@@ -522,6 +528,7 @@ void svc_wake_up(struct svc_serv *serv)
/* No free entries available */
set_bit(SP_TASK_PENDING, &pool->sp_flags);
smp_wmb();
+ trace_svc_wake_up(0);
}
EXPORT_SYMBOL_GPL(svc_wake_up);
@@ -740,7 +747,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
dprintk("svc_recv: found XPT_CLOSE\n");
svc_delete_xprt(xprt);
/* Leave XPT_BUSY set on the dead xprt: */
- return 0;
+ goto out;
}
if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
struct svc_xprt *newxpt;
@@ -771,6 +778,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
}
/* clear XPT_BUSY: */
svc_xprt_received(xprt);
+out:
+ trace_svc_handle_xprt(xprt, len);
return len;
}
--
2.1.0
On Fri, 21 Nov 2014 14:19:27 -0500
Jeff Layton <[email protected]> wrote:
> Hi Bruce!
>
> Here are the patches that I had mentioned earlier that reduce the
> contention for the pool->sp_lock when the server is heavily loaded.
>
> The basic problem is that whenever a svc_xprt needs to be queued up for
> servicing, we have to take the pool->sp_lock to try and find an idle
> thread to service it. On a busy server, that lock becomes highly
> contended and that limits the throughput.
>
> This patchset fixes this by changing how we search for an idle thread.
> First, we convert svc_rqst and the sp_all_threads list to be
> RCU-managed. Then we change the search for an idle thread to use the
> sp_all_threads list, which now can be done under the rcu_read_lock.
> When there is an available thread, queueing an xprt to it can now be
> done without any spinlocking.
>
> With this, we see a pretty substantial increase in performance on a
> larger-scale server that is heavily loaded. Chris has some preliminary
> numbers, but they need to be cleaned up a bit before we can present
> them. I'm hoping to have those by early next week.
>
> Jeff Layton (4):
> sunrpc: add a rcu_head to svc_rqst and use kfree_rcu to free it
> sunrpc: fix potential races in pool_stats collection
> sunrpc: convert to lockless lookup of queued server threads
> sunrpc: add some tracepoints around enqueue and dequeue of svc_xprt
>
> include/linux/sunrpc/svc.h | 12 +-
> include/trace/events/sunrpc.h | 98 +++++++++++++++-
> net/sunrpc/svc.c | 17 +--
> net/sunrpc/svc_xprt.c | 252 ++++++++++++++++++++++++------------------
> 4 files changed, 258 insertions(+), 121 deletions(-)
>
Here's what I've got so far.
This is just a chart that shows the % increase in the number of iops in
a distributed test on a NFSv3 server with this patchset vs. without.
The numbers along the bottom show the number of total job threads
running. Chris says:
"There were 64 nfsd threads running on the server.
There were 7 hypervisors running 2 VMs each running 2 and 4 threads per
VM. Thus, 56 and 112 threads total."
Cheers!
--
Jeff Layton <[email protected]>
On Tue, Nov 25, 2014 at 04:25:57PM -0500, Jeff Layton wrote:
> On Fri, 21 Nov 2014 14:19:27 -0500
> Jeff Layton <[email protected]> wrote:
>
> > Hi Bruce!
> >
> > Here are the patches that I had mentioned earlier that reduce the
> > contention for the pool->sp_lock when the server is heavily loaded.
> >
> > The basic problem is that whenever a svc_xprt needs to be queued up for
> > servicing, we have to take the pool->sp_lock to try and find an idle
> > thread to service it. On a busy server, that lock becomes highly
> > contended and that limits the throughput.
> >
> > This patchset fixes this by changing how we search for an idle thread.
> > First, we convert svc_rqst and the sp_all_threads list to be
> > RCU-managed. Then we change the search for an idle thread to use the
> > sp_all_threads list, which now can be done under the rcu_read_lock.
> > When there is an available thread, queueing an xprt to it can now be
> > done without any spinlocking.
> >
> > With this, we see a pretty substantial increase in performance on a
> > larger-scale server that is heavily loaded. Chris has some preliminary
> > numbers, but they need to be cleaned up a bit before we can present
> > them. I'm hoping to have those by early next week.
> >
> > Jeff Layton (4):
> > sunrpc: add a rcu_head to svc_rqst and use kfree_rcu to free it
> > sunrpc: fix potential races in pool_stats collection
> > sunrpc: convert to lockless lookup of queued server threads
> > sunrpc: add some tracepoints around enqueue and dequeue of svc_xprt
> >
> > include/linux/sunrpc/svc.h | 12 +-
> > include/trace/events/sunrpc.h | 98 +++++++++++++++-
> > net/sunrpc/svc.c | 17 +--
> > net/sunrpc/svc_xprt.c | 252 ++++++++++++++++++++++++------------------
> > 4 files changed, 258 insertions(+), 121 deletions(-)
> >
>
> Here's what I've got so far.
>
> This is just a chart that shows the % increase in the number of iops in
> a distributed test on a NFSv3 server with this patchset vs. without.
>
> The numbers along the bottom show the number of total job threads
> running. Chris says:
>
> "There were 64 nfsd threads running on the server.
>
> There were 7 hypervisors running 2 VMs each running 2 and 4 threads per
> VM. Thus, 56 and 112 threads total."
Thanks!
Results that someone else could reproduce would be much better.
(Where's the source code for the test? What's the base the patchset was
applied to? What was the hardware? I understand that's a lot of
information.) But it's nice to see some numbers at least.
(I wonder what the reason is for the odd shape in the 112-thread case
(descending slightly as the writes decrease and then shooting up when
they go to zero.) OK, I guess that's what you get if you just assume
read-write contention is expensive and one write is slightly more
expensive than one read. But then why doesn't it behave the same way in
the 56-thread case?)
--b.
>
> Cheers!
> --
> Jeff Layton <[email protected]>
On Tue, 25 Nov 2014 19:09:41 -0500
"J. Bruce Fields" <[email protected]> wrote:
> On Tue, Nov 25, 2014 at 04:25:57PM -0500, Jeff Layton wrote:
> > On Fri, 21 Nov 2014 14:19:27 -0500
> > Jeff Layton <[email protected]> wrote:
> >
> > > Hi Bruce!
> > >
> > > Here are the patches that I had mentioned earlier that reduce the
> > > contention for the pool->sp_lock when the server is heavily loaded.
> > >
> > > The basic problem is that whenever a svc_xprt needs to be queued up for
> > > servicing, we have to take the pool->sp_lock to try and find an idle
> > > thread to service it. On a busy server, that lock becomes highly
> > > contended and that limits the throughput.
> > >
> > > This patchset fixes this by changing how we search for an idle thread.
> > > First, we convert svc_rqst and the sp_all_threads list to be
> > > RCU-managed. Then we change the search for an idle thread to use the
> > > sp_all_threads list, which now can be done under the rcu_read_lock.
> > > When there is an available thread, queueing an xprt to it can now be
> > > done without any spinlocking.
> > >
> > > With this, we see a pretty substantial increase in performance on a
> > > larger-scale server that is heavily loaded. Chris has some preliminary
> > > numbers, but they need to be cleaned up a bit before we can present
> > > them. I'm hoping to have those by early next week.
> > >
> > > Jeff Layton (4):
> > > sunrpc: add a rcu_head to svc_rqst and use kfree_rcu to free it
> > > sunrpc: fix potential races in pool_stats collection
> > > sunrpc: convert to lockless lookup of queued server threads
> > > sunrpc: add some tracepoints around enqueue and dequeue of svc_xprt
> > >
> > > include/linux/sunrpc/svc.h | 12 +-
> > > include/trace/events/sunrpc.h | 98 +++++++++++++++-
> > > net/sunrpc/svc.c | 17 +--
> > > net/sunrpc/svc_xprt.c | 252 ++++++++++++++++++++++++------------------
> > > 4 files changed, 258 insertions(+), 121 deletions(-)
> > >
> >
> > Here's what I've got so far.
> >
> > This is just a chart that shows the % increase in the number of iops in
> > a distributed test on a NFSv3 server with this patchset vs. without.
> >
> > The numbers along the bottom show the number of total job threads
> > running. Chris says:
> >
> > "There were 64 nfsd threads running on the server.
> >
> > There were 7 hypervisors running 2 VMs each running 2 and 4 threads per
> > VM. Thus, 56 and 112 threads total."
>
> Thanks!
>
Good questions all around. I'll try to answer them as best I can:
> Results that someone else could reproduce would be much better.
> (Where's the source code for the test?
The test is just fio (which is available in the fedora repos, fwiw):
http://git.kernel.dk/?p=fio.git;a=summary
...but we'd have to ask Chris for the job files. Chris, can those be
released?
> What's the base the patchset was
> applied to?
The base was a v3.14-ish kernel with a pile of patches on top (mostly,
the ones that Trond asked you to merge for v3.18). The only difference
between the "baseline" and "patched" kernels is this set, plus a few
patches from upstream that made it apply more cleanly. None of those
should have much effect on the results though.
> What was the hardware?
Again, I'll have to defer that question to Chris. I don't know much
about the hw in use here, other than that it has some pretty fast
storage (high perf. SSDs).
> I understand that's a lot of
> information.) But it's nice to see some numbers at least.
>
> (I wonder what the reason is for the odd shape in the 112-thread case
> (descending slightly as the writes decrease and then shooting up when
> they go to zero.) OK, I guess that's what you get if you just assume
> read-write contention is expensive and one write is slightly more
> expensive than one read. But then why doesn't it behave the same way in
> the 56-thread case?)
>
Yeah, I wondered about that too.
There is some virtualization in use on the clients here (and it's
vmware too), so I have to wonder if there's some variance in the
numbers due to weirdo virt behaviors or something.
The good news is that the overall trend pretty clearly shows a
performance increase.
As always, benchmark results point out the need for more benchmarks.
--
Jeff Layton <[email protected]>
On Tue, Nov 25, 2014 at 07:38:18PM -0500, Jeff Layton wrote:
> On Tue, 25 Nov 2014 19:09:41 -0500
> "J. Bruce Fields" <[email protected]> wrote:
> > I understand that's a lot of
> > information.) But it's nice to see some numbers at least.
> >
> > (I wonder what the reason is for the odd shape in the 112-thread case
> > (descending slightly as the writes decrease and then shooting up when
> > they go to zero.) OK, I guess that's what you get if you just assume
> > read-write contention is expensive and one write is slightly more
> > expensive than one read. But then why doesn't it behave the same way in
> > the 56-thread case?)
> >
>
> Yeah, I wondered about that too.
I was also forgetting that these are percentage increases.
For the future something that gave just the before and after numbers
side-by-side might be easier to think about?
> There is some virtualization in use on the clients here (and it's
> vmware too), so I have to wonder if there's some variance in the
> numbers due to weirdo virt behaviors or something.
>
> The good news is that the overall trend pretty clearly shows a
> performance increase.
Yep, sure.
--b.
>
> As always, benchmark results point out the need for more benchmarks.
>
> --
> Jeff Layton <[email protected]>
On Tue, 25 Nov 2014 21:40:07 -0500
"J. Bruce Fields" <[email protected]> wrote:
> On Tue, Nov 25, 2014 at 07:38:18PM -0500, Jeff Layton wrote:
> > On Tue, 25 Nov 2014 19:09:41 -0500
> > "J. Bruce Fields" <[email protected]> wrote:
> > > I understand that's a lot of
> > > information.) But it's nice to see some numbers at least.
> > >
> > > (I wonder what the reason is for the odd shape in the 112-thread case
> > > (descending slightly as the writes decrease and then shooting up when
> > > they go to zero.) OK, I guess that's what you get if you just assume
> > > read-write contention is expensive and one write is slightly more
> > > expensive than one read. But then why doesn't it behave the same way in
> > > the 56-thread case?)
> > >
> >
> > Yeah, I wondered about that too.
>
> I was also forgetting that these are percentage increases.
>
> For the future something that gave just the before and after numbers
> side-by-side might be easier to think about?
>
Alas, that's what I can't release here. One of the perils of working at
a secretive startup. The part I can talk about is the fio job he was
using to test it:
"It was a fio 70/30 r/w random mix, where every thread was on a
separate file."
...and this about the server hardware:
"On the server side it was Dual 2.6GHz Ivy-bridge 8-core w/ hyper
threading enabled w/ 128GB RAM"
Unfortunately, I can't tell much about the underlying storage on the
server, other than that it's quite fast.
> > There is some virtualization in use on the clients here (and it's
> > vmware too), so I have to wonder if there's some variance in the
> > numbers due to weirdo virt behaviors or something.
> >
> > The good news is that the overall trend pretty clearly shows a
> > performance increase.
>
> Yep, sure.
>
> >
> > As always, benchmark results point out the need for more benchmarks.
> >
> > --
> > Jeff Layton <[email protected]>
--
Jeff Layton <[email protected]>
On Fri, Nov 21, 2014 at 02:19:28PM -0500, Jeff Layton wrote:
> ...also make the manipulation of sp_all_threads list use RCU-friendly
> functions.
>
> Signed-off-by: Jeff Layton <[email protected]>
> Tested-by: Chris Worley <[email protected]>
> ---
> include/linux/sunrpc/svc.h | 2 ++
> include/trace/events/sunrpc.h | 3 ++-
> net/sunrpc/svc.c | 10 ++++++----
> 3 files changed, 10 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> index 5f0ab39bf7c3..7f80a99c59e4 100644
> --- a/include/linux/sunrpc/svc.h
> +++ b/include/linux/sunrpc/svc.h
> @@ -223,6 +223,7 @@ static inline void svc_putu32(struct kvec *iov, __be32 val)
> struct svc_rqst {
> struct list_head rq_list; /* idle list */
> struct list_head rq_all; /* all threads list */
> + struct rcu_head rq_rcu_head; /* for RCU deferred kfree */
> struct svc_xprt * rq_xprt; /* transport ptr */
>
> struct sockaddr_storage rq_addr; /* peer address */
> @@ -262,6 +263,7 @@ struct svc_rqst {
> #define RQ_SPLICE_OK (4) /* turned off in gss privacy
> * to prevent encrypting page
> * cache pages */
> +#define RQ_VICTIM (5) /* about to be shut down */
> unsigned long rq_flags; /* flags field */
>
> void * rq_argp; /* decoded arguments */
> diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
> index 5848fc235869..08a5fed50f34 100644
> --- a/include/trace/events/sunrpc.h
> +++ b/include/trace/events/sunrpc.h
> @@ -418,7 +418,8 @@ TRACE_EVENT(xs_tcp_data_recv,
> { (1UL << RQ_LOCAL), "RQ_LOCAL"}, \
> { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \
> { (1UL << RQ_DROPME), "RQ_DROPME"}, \
> - { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"})
> + { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}, \
> + { (1UL << RQ_VICTIM), "RQ_VICTIM"})
>
> TRACE_EVENT(svc_recv,
> TP_PROTO(struct svc_rqst *rqst, int status),
> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> index 5d9a443d21f6..4edef32f3b9f 100644
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c
> @@ -616,7 +616,7 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
> serv->sv_nrthreads++;
> spin_lock_bh(&pool->sp_lock);
> pool->sp_nrthreads++;
> - list_add(&rqstp->rq_all, &pool->sp_all_threads);
> + list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
> spin_unlock_bh(&pool->sp_lock);
> rqstp->rq_server = serv;
> rqstp->rq_pool = pool;
> @@ -684,7 +684,8 @@ found_pool:
> * so we don't try to kill it again.
> */
> rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all);
> - list_del_init(&rqstp->rq_all);
> + set_bit(RQ_VICTIM, &rqstp->rq_flags);
> + list_del_rcu(&rqstp->rq_all);
> task = rqstp->rq_task;
> }
> spin_unlock_bh(&pool->sp_lock);
> @@ -782,10 +783,11 @@ svc_exit_thread(struct svc_rqst *rqstp)
>
> spin_lock_bh(&pool->sp_lock);
> pool->sp_nrthreads--;
> - list_del(&rqstp->rq_all);
> + if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags))
> + list_del_rcu(&rqstp->rq_all);
Both users of RQ_VICTIM are under the sp_lock, so we don't really need
an atomic test_and_set_bit, do we?
But I guess svc_exit_thread probably still needs to check for the case
where it's called on a thread that svc_set_num_threads has already
chosen, and this works even if it's overkill. OK, fine.
--b.
> spin_unlock_bh(&pool->sp_lock);
>
> - kfree(rqstp);
> + kfree_rcu(rqstp, rq_rcu_head);
>
> /* Release the server */
> if (serv)
> --
> 2.1.0
>
On Mon, 1 Dec 2014 17:44:07 -0500
"J. Bruce Fields" <[email protected]> wrote:
> On Fri, Nov 21, 2014 at 02:19:28PM -0500, Jeff Layton wrote:
> > ...also make the manipulation of sp_all_threads list use RCU-friendly
> > functions.
> >
> > Signed-off-by: Jeff Layton <[email protected]>
> > Tested-by: Chris Worley <[email protected]>
> > ---
> > include/linux/sunrpc/svc.h | 2 ++
> > include/trace/events/sunrpc.h | 3 ++-
> > net/sunrpc/svc.c | 10 ++++++----
> > 3 files changed, 10 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> > index 5f0ab39bf7c3..7f80a99c59e4 100644
> > --- a/include/linux/sunrpc/svc.h
> > +++ b/include/linux/sunrpc/svc.h
> > @@ -223,6 +223,7 @@ static inline void svc_putu32(struct kvec *iov, __be32 val)
> > struct svc_rqst {
> > struct list_head rq_list; /* idle list */
> > struct list_head rq_all; /* all threads list */
> > + struct rcu_head rq_rcu_head; /* for RCU deferred kfree */
> > struct svc_xprt * rq_xprt; /* transport ptr */
> >
> > struct sockaddr_storage rq_addr; /* peer address */
> > @@ -262,6 +263,7 @@ struct svc_rqst {
> > #define RQ_SPLICE_OK (4) /* turned off in gss privacy
> > * to prevent encrypting page
> > * cache pages */
> > +#define RQ_VICTIM (5) /* about to be shut down */
> > unsigned long rq_flags; /* flags field */
> >
> > void * rq_argp; /* decoded arguments */
> > diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
> > index 5848fc235869..08a5fed50f34 100644
> > --- a/include/trace/events/sunrpc.h
> > +++ b/include/trace/events/sunrpc.h
> > @@ -418,7 +418,8 @@ TRACE_EVENT(xs_tcp_data_recv,
> > { (1UL << RQ_LOCAL), "RQ_LOCAL"}, \
> > { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \
> > { (1UL << RQ_DROPME), "RQ_DROPME"}, \
> > - { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"})
> > + { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}, \
> > + { (1UL << RQ_VICTIM), "RQ_VICTIM"})
> >
> > TRACE_EVENT(svc_recv,
> > TP_PROTO(struct svc_rqst *rqst, int status),
> > diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> > index 5d9a443d21f6..4edef32f3b9f 100644
> > --- a/net/sunrpc/svc.c
> > +++ b/net/sunrpc/svc.c
> > @@ -616,7 +616,7 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
> > serv->sv_nrthreads++;
> > spin_lock_bh(&pool->sp_lock);
> > pool->sp_nrthreads++;
> > - list_add(&rqstp->rq_all, &pool->sp_all_threads);
> > + list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
> > spin_unlock_bh(&pool->sp_lock);
> > rqstp->rq_server = serv;
> > rqstp->rq_pool = pool;
> > @@ -684,7 +684,8 @@ found_pool:
> > * so we don't try to kill it again.
> > */
> > rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all);
> > - list_del_init(&rqstp->rq_all);
> > + set_bit(RQ_VICTIM, &rqstp->rq_flags);
> > + list_del_rcu(&rqstp->rq_all);
> > task = rqstp->rq_task;
> > }
> > spin_unlock_bh(&pool->sp_lock);
> > @@ -782,10 +783,11 @@ svc_exit_thread(struct svc_rqst *rqstp)
> >
> > spin_lock_bh(&pool->sp_lock);
> > pool->sp_nrthreads--;
> > - list_del(&rqstp->rq_all);
> > + if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags))
> > + list_del_rcu(&rqstp->rq_all);
>
> Both users of RQ_VICTIM are under the sp_lock, so we don't really need
> an atomic test_and_set_bit, do we?
>
No, it doesn't really need to be an atomic test_and_set_bit. We could
just as easily do:
if (!test_bit(...)) {
set_bit(...)
list_del_rcu()
}
...but this works and I think it makes for easier reading. Is it less
efficient? Maybe, but this is not anywhere near a hot codepath so a
couple of extra cycles really shouldn't matter.
> But I guess svc_exit_thread probably still needs to check for the case
> where it's called on a thread that svc_set_num_threads has already
> chosen, and this works even if it's overkill. OK, fine.
>
Right. We can't use list_del_init in choose_victim anymore because
we're switching this list over to RCU. So, we need some way to know
whether it still needs to be deleted from the list or not. RQ_VICTIM is
what indicates that.
> > spin_unlock_bh(&pool->sp_lock);
> >
> > - kfree(rqstp);
> > + kfree_rcu(rqstp, rq_rcu_head);
> >
> > /* Release the server */
> > if (serv)
> > --
> > 2.1.0
> >
--
Jeff Layton <[email protected]>
On Mon, Dec 1, 2014 at 6:05 PM, Jeff Layton <[email protected]> wrote:
> On Mon, 1 Dec 2014 17:44:07 -0500
> "J. Bruce Fields" <[email protected]> wrote:
>
>> On Fri, Nov 21, 2014 at 02:19:28PM -0500, Jeff Layton wrote:
>> > ...also make the manipulation of sp_all_threads list use RCU-friendly
>> > functions.
>> >
>> > Signed-off-by: Jeff Layton <[email protected]>
>> > Tested-by: Chris Worley <[email protected]>
>> > ---
>> > include/linux/sunrpc/svc.h | 2 ++
>> > include/trace/events/sunrpc.h | 3 ++-
>> > net/sunrpc/svc.c | 10 ++++++----
>> > 3 files changed, 10 insertions(+), 5 deletions(-)
>> >
>> > diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
>> > index 5f0ab39bf7c3..7f80a99c59e4 100644
>> > --- a/include/linux/sunrpc/svc.h
>> > +++ b/include/linux/sunrpc/svc.h
>> > @@ -223,6 +223,7 @@ static inline void svc_putu32(struct kvec *iov, __be32 val)
>> > struct svc_rqst {
>> > struct list_head rq_list; /* idle list */
>> > struct list_head rq_all; /* all threads list */
>> > + struct rcu_head rq_rcu_head; /* for RCU deferred kfree */
>> > struct svc_xprt * rq_xprt; /* transport ptr */
>> >
>> > struct sockaddr_storage rq_addr; /* peer address */
>> > @@ -262,6 +263,7 @@ struct svc_rqst {
>> > #define RQ_SPLICE_OK (4) /* turned off in gss privacy
>> > * to prevent encrypting page
>> > * cache pages */
>> > +#define RQ_VICTIM (5) /* about to be shut down */
>> > unsigned long rq_flags; /* flags field */
>> >
>> > void * rq_argp; /* decoded arguments */
>> > diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
>> > index 5848fc235869..08a5fed50f34 100644
>> > --- a/include/trace/events/sunrpc.h
>> > +++ b/include/trace/events/sunrpc.h
>> > @@ -418,7 +418,8 @@ TRACE_EVENT(xs_tcp_data_recv,
>> > { (1UL << RQ_LOCAL), "RQ_LOCAL"}, \
>> > { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \
>> > { (1UL << RQ_DROPME), "RQ_DROPME"}, \
>> > - { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"})
>> > + { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}, \
>> > + { (1UL << RQ_VICTIM), "RQ_VICTIM"})
>> >
>> > TRACE_EVENT(svc_recv,
>> > TP_PROTO(struct svc_rqst *rqst, int status),
>> > diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
>> > index 5d9a443d21f6..4edef32f3b9f 100644
>> > --- a/net/sunrpc/svc.c
>> > +++ b/net/sunrpc/svc.c
>> > @@ -616,7 +616,7 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
>> > serv->sv_nrthreads++;
>> > spin_lock_bh(&pool->sp_lock);
>> > pool->sp_nrthreads++;
>> > - list_add(&rqstp->rq_all, &pool->sp_all_threads);
>> > + list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
>> > spin_unlock_bh(&pool->sp_lock);
>> > rqstp->rq_server = serv;
>> > rqstp->rq_pool = pool;
>> > @@ -684,7 +684,8 @@ found_pool:
>> > * so we don't try to kill it again.
>> > */
>> > rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all);
>> > - list_del_init(&rqstp->rq_all);
>> > + set_bit(RQ_VICTIM, &rqstp->rq_flags);
>> > + list_del_rcu(&rqstp->rq_all);
>> > task = rqstp->rq_task;
>> > }
>> > spin_unlock_bh(&pool->sp_lock);
>> > @@ -782,10 +783,11 @@ svc_exit_thread(struct svc_rqst *rqstp)
>> >
>> > spin_lock_bh(&pool->sp_lock);
>> > pool->sp_nrthreads--;
>> > - list_del(&rqstp->rq_all);
>> > + if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags))
>> > + list_del_rcu(&rqstp->rq_all);
>>
>> Both users of RQ_VICTIM are under the sp_lock, so we don't really need
>> an atomic test_and_set_bit, do we?
>>
>
> No, it doesn't really need to be an atomic test_and_set_bit. We could
> just as easily do:
>
> if (!test_bit(...)) {
> set_bit(...)
> list_del_rcu()
> }
Isn't there a chance that the non-atomic version might end up
clobbering one of the other bits that is not set/cleared under the
sp_lock?
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
On Mon, 1 Dec 2014 18:36:16 -0500
Trond Myklebust <[email protected]> wrote:
> On Mon, Dec 1, 2014 at 6:05 PM, Jeff Layton <[email protected]> wrote:
> > On Mon, 1 Dec 2014 17:44:07 -0500
> > "J. Bruce Fields" <[email protected]> wrote:
> >
> >> On Fri, Nov 21, 2014 at 02:19:28PM -0500, Jeff Layton wrote:
> >> > ...also make the manipulation of sp_all_threads list use RCU-friendly
> >> > functions.
> >> >
> >> > Signed-off-by: Jeff Layton <[email protected]>
> >> > Tested-by: Chris Worley <[email protected]>
> >> > ---
> >> > include/linux/sunrpc/svc.h | 2 ++
> >> > include/trace/events/sunrpc.h | 3 ++-
> >> > net/sunrpc/svc.c | 10 ++++++----
> >> > 3 files changed, 10 insertions(+), 5 deletions(-)
> >> >
> >> > diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> >> > index 5f0ab39bf7c3..7f80a99c59e4 100644
> >> > --- a/include/linux/sunrpc/svc.h
> >> > +++ b/include/linux/sunrpc/svc.h
> >> > @@ -223,6 +223,7 @@ static inline void svc_putu32(struct kvec *iov, __be32 val)
> >> > struct svc_rqst {
> >> > struct list_head rq_list; /* idle list */
> >> > struct list_head rq_all; /* all threads list */
> >> > + struct rcu_head rq_rcu_head; /* for RCU deferred kfree */
> >> > struct svc_xprt * rq_xprt; /* transport ptr */
> >> >
> >> > struct sockaddr_storage rq_addr; /* peer address */
> >> > @@ -262,6 +263,7 @@ struct svc_rqst {
> >> > #define RQ_SPLICE_OK (4) /* turned off in gss privacy
> >> > * to prevent encrypting page
> >> > * cache pages */
> >> > +#define RQ_VICTIM (5) /* about to be shut down */
> >> > unsigned long rq_flags; /* flags field */
> >> >
> >> > void * rq_argp; /* decoded arguments */
> >> > diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
> >> > index 5848fc235869..08a5fed50f34 100644
> >> > --- a/include/trace/events/sunrpc.h
> >> > +++ b/include/trace/events/sunrpc.h
> >> > @@ -418,7 +418,8 @@ TRACE_EVENT(xs_tcp_data_recv,
> >> > { (1UL << RQ_LOCAL), "RQ_LOCAL"}, \
> >> > { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \
> >> > { (1UL << RQ_DROPME), "RQ_DROPME"}, \
> >> > - { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"})
> >> > + { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}, \
> >> > + { (1UL << RQ_VICTIM), "RQ_VICTIM"})
> >> >
> >> > TRACE_EVENT(svc_recv,
> >> > TP_PROTO(struct svc_rqst *rqst, int status),
> >> > diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> >> > index 5d9a443d21f6..4edef32f3b9f 100644
> >> > --- a/net/sunrpc/svc.c
> >> > +++ b/net/sunrpc/svc.c
> >> > @@ -616,7 +616,7 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
> >> > serv->sv_nrthreads++;
> >> > spin_lock_bh(&pool->sp_lock);
> >> > pool->sp_nrthreads++;
> >> > - list_add(&rqstp->rq_all, &pool->sp_all_threads);
> >> > + list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
> >> > spin_unlock_bh(&pool->sp_lock);
> >> > rqstp->rq_server = serv;
> >> > rqstp->rq_pool = pool;
> >> > @@ -684,7 +684,8 @@ found_pool:
> >> > * so we don't try to kill it again.
> >> > */
> >> > rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all);
> >> > - list_del_init(&rqstp->rq_all);
> >> > + set_bit(RQ_VICTIM, &rqstp->rq_flags);
> >> > + list_del_rcu(&rqstp->rq_all);
> >> > task = rqstp->rq_task;
> >> > }
> >> > spin_unlock_bh(&pool->sp_lock);
> >> > @@ -782,10 +783,11 @@ svc_exit_thread(struct svc_rqst *rqstp)
> >> >
> >> > spin_lock_bh(&pool->sp_lock);
> >> > pool->sp_nrthreads--;
> >> > - list_del(&rqstp->rq_all);
> >> > + if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags))
> >> > + list_del_rcu(&rqstp->rq_all);
> >>
> >> Both users of RQ_VICTIM are under the sp_lock, so we don't really need
> >> an atomic test_and_set_bit, do we?
> >>
> >
> > No, it doesn't really need to be an atomic test_and_set_bit. We could
> > just as easily do:
> >
> > if (!test_bit(...)) {
> > set_bit(...)
> > list_del_rcu()
> > }
>
> Isn't there a chance that the non-atomic version might end up
> clobbering one of the other bits that is not set/cleared under the
> sp_lock?
>
There are two atomicity "concerns" here...
The main thing is to ensure that we use set_bit or test_and_set_bit to
set the flag. What we *can't* use is __set_bit which is non-atomic
or we'd end up hitting the exact problem you're talking about (possibly
changing an unrelated flag in the field that happened to flip at nearly
the same time).
What's not necessary here is to use test_and_set_bit since all of this
is done under spinlock anyway. In principle, we could do a test_bit and
then follow that up with a set_bit if it's clear. But, I don't think
that really buys us much, and tend to find the test_and_set_bit to be
clearer when reading the code.
--
Jeff Layton <[email protected]>
On Mon, Dec 1, 2014 at 7:29 PM, Jeff Layton <[email protected]> wrote:
> There are two atomicity "concerns" here...
>
> The main thing is to ensure that we use set_bit or test_and_set_bit to
> set the flag. What we *can't* use is __set_bit which is non-atomic
> or we'd end up hitting the exact problem you're talking about (possibly
> changing an unrelated flag in the field that happened to flip at nearly
> the same time).
>
> What's not necessary here is to use test_and_set_bit since all of this
> is done under spinlock anyway. In principle, we could do a test_bit and
> then follow that up with a set_bit if it's clear. But, I don't think
> that really buys us much, and tend to find the test_and_set_bit to be
> clearer when reading the code.
Fair enough. I too would be surprised if you could actually measure
that performance difference in the thread kill code.
Cheers,
Trond
On Mon, Dec 01, 2014 at 07:52:20PM -0500, Trond Myklebust wrote:
> On Mon, Dec 1, 2014 at 7:29 PM, Jeff Layton <[email protected]> wrote:
> > There are two atomicity "concerns" here...
> >
> > The main thing is to ensure that we use set_bit or test_and_set_bit to
> > set the flag. What we *can't* use is __set_bit which is non-atomic
> > or we'd end up hitting the exact problem you're talking about (possibly
> > changing an unrelated flag in the field that happened to flip at nearly
> > the same time).
> >
> > What's not necessary here is to use test_and_set_bit since all of this
> > is done under spinlock anyway. In principle, we could do a test_bit and
> > then follow that up with a set_bit if it's clear. But, I don't think
> > that really buys us much, and tend to find the test_and_set_bit to be
> > clearer when reading the code.
>
> Fair enough. I too would be surprised if you could actually measure
> that performance difference in the thread kill code.
Yeah, please don't bother, I was just thinking aloud here.
--b.
On Fri, Nov 21, 2014 at 02:19:30PM -0500, Jeff Layton wrote:
> Testing has shown that the pool->sp_lock can be a bottleneck on a busy
> server. Every time data is received on a socket, the server must take
> that lock in order to dequeue a thread from the sp_threads list.
>
> Address this problem by eliminating the sp_threads list (which contains
> threads that are currently idle) and replacing it with a RQ_BUSY flag in
> svc_rqst. This allows us to walk the sp_all_threads list under the
> rcu_read_lock and find a suitable thread for the xprt by doing a
> test_and_set_bit.
>
> Note that we do still have a potential atomicity problem however with
> this approach. We don't want svc_xprt_do_enqueue to set the
> rqst->rq_xprt pointer unless a test_and_set_bit of RQ_BUSY returned
> negative (which indicates that the thread was idle). But, by the time we
> check that, the big could be flipped by a waking thread.
(Nits: replacing "negative" by "zero" and "big" by "bit".)
> To address this, we acquire a new per-rqst spinlock (rq_lock) and take
> that before doing the test_and_set_bit. If that returns false, then we
> can set rq_xprt and drop the spinlock. Then, when the thread wakes up,
> it must set the bit under the same spinlock and can trust that if it was
> already set then the rq_xprt is also properly set.
>
> With this scheme, the case where we have an idle thread no longer needs
> to take the highly contended pool->sp_lock at all, and that removes the
> bottleneck.
>
> That still leaves one issue: What of the case where we walk the whole
> sp_all_threads list and don't find an idle thread? Because the search is
> lockess, it's possible for the queueing to race with a thread that is
> going to sleep. To address that, we queue the xprt and then search again.
>
> If we find an idle thread at that point, we can't attach the xprt to it
> directly since that might race with a different thread waking up and
> finding it. All we can do is wake the idle thread back up and let it
> attempt to find the now-queued xprt.
I find it hard to think about how we expect this to affect performance.
So it comes down to the observed results, I guess, but just trying to
get an idea:
- this eliminates sp_lock. I think the original idea here was
that if interrupts could be routed correctly then there
shouldn't normally be cross-cpu contention on this lock. Do
we understand why that didn't pan out? Is hardware capable of
doing this really rare, or is it just too hard to configure it
correctly?
- instead we're walking the list of all threads looking for an
idle one. I suppose that's tpyically not more than a few
hundred. Does this being fast depend on the fact that that
list is almost never changed? Should we be rearranging
svc_rqst so frequently-written fields aren't nearby?
--b.
>
> Signed-off-by: Jeff Layton <[email protected]>
> Tested-by: Chris Worley <[email protected]>
> ---
> include/linux/sunrpc/svc.h | 4 +-
> include/trace/events/sunrpc.h | 3 +-
> net/sunrpc/svc.c | 7 +-
> net/sunrpc/svc_xprt.c | 221 ++++++++++++++++++++++++------------------
> 4 files changed, 132 insertions(+), 103 deletions(-)
>
> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> index 513957eba0a5..6f22cfeef5e3 100644
> --- a/include/linux/sunrpc/svc.h
> +++ b/include/linux/sunrpc/svc.h
> @@ -45,7 +45,6 @@ struct svc_pool_stats {
> struct svc_pool {
> unsigned int sp_id; /* pool id; also node id on NUMA */
> spinlock_t sp_lock; /* protects all fields */
> - struct list_head sp_threads; /* idle server threads */
> struct list_head sp_sockets; /* pending sockets */
> unsigned int sp_nrthreads; /* # of threads in pool */
> struct list_head sp_all_threads; /* all server threads */
> @@ -221,7 +220,6 @@ static inline void svc_putu32(struct kvec *iov, __be32 val)
> * processed.
> */
> struct svc_rqst {
> - struct list_head rq_list; /* idle list */
> struct list_head rq_all; /* all threads list */
> struct rcu_head rq_rcu_head; /* for RCU deferred kfree */
> struct svc_xprt * rq_xprt; /* transport ptr */
> @@ -264,6 +262,7 @@ struct svc_rqst {
> * to prevent encrypting page
> * cache pages */
> #define RQ_VICTIM (5) /* about to be shut down */
> +#define RQ_BUSY (6) /* request is busy */
> unsigned long rq_flags; /* flags field */
>
> void * rq_argp; /* decoded arguments */
> @@ -285,6 +284,7 @@ struct svc_rqst {
> struct auth_domain * rq_gssclient; /* "gss/"-style peer info */
> struct svc_cacherep * rq_cacherep; /* cache info */
> struct task_struct *rq_task; /* service thread */
> + spinlock_t rq_lock; /* per-request lock */
> };
>
> #define SVC_NET(svc_rqst) (svc_rqst->rq_xprt->xpt_net)
> diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
> index 08a5fed50f34..ee4438a63a48 100644
> --- a/include/trace/events/sunrpc.h
> +++ b/include/trace/events/sunrpc.h
> @@ -419,7 +419,8 @@ TRACE_EVENT(xs_tcp_data_recv,
> { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \
> { (1UL << RQ_DROPME), "RQ_DROPME"}, \
> { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}, \
> - { (1UL << RQ_VICTIM), "RQ_VICTIM"})
> + { (1UL << RQ_VICTIM), "RQ_VICTIM"}, \
> + { (1UL << RQ_BUSY), "RQ_BUSY"})
>
> TRACE_EVENT(svc_recv,
> TP_PROTO(struct svc_rqst *rqst, int status),
> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> index 4edef32f3b9f..4308881d9d0a 100644
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c
> @@ -476,7 +476,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
> i, serv->sv_name);
>
> pool->sp_id = i;
> - INIT_LIST_HEAD(&pool->sp_threads);
> INIT_LIST_HEAD(&pool->sp_sockets);
> INIT_LIST_HEAD(&pool->sp_all_threads);
> spin_lock_init(&pool->sp_lock);
> @@ -614,12 +613,14 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
> goto out_enomem;
>
> serv->sv_nrthreads++;
> + __set_bit(RQ_BUSY, &rqstp->rq_flags);
> + spin_lock_init(&rqstp->rq_lock);
> + rqstp->rq_server = serv;
> + rqstp->rq_pool = pool;
> spin_lock_bh(&pool->sp_lock);
> pool->sp_nrthreads++;
> list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
> spin_unlock_bh(&pool->sp_lock);
> - rqstp->rq_server = serv;
> - rqstp->rq_pool = pool;
>
> rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
> if (!rqstp->rq_argp)
> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> index 579ff2249562..ed90d955f733 100644
> --- a/net/sunrpc/svc_xprt.c
> +++ b/net/sunrpc/svc_xprt.c
> @@ -310,25 +310,6 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
> }
> EXPORT_SYMBOL_GPL(svc_print_addr);
>
> -/*
> - * Queue up an idle server thread. Must have pool->sp_lock held.
> - * Note: this is really a stack rather than a queue, so that we only
> - * use as many different threads as we need, and the rest don't pollute
> - * the cache.
> - */
> -static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
> -{
> - list_add(&rqstp->rq_list, &pool->sp_threads);
> -}
> -
> -/*
> - * Dequeue an nfsd thread. Must have pool->sp_lock held.
> - */
> -static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
> -{
> - list_del(&rqstp->rq_list);
> -}
> -
> static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
> {
> if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
> @@ -343,6 +324,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
> struct svc_pool *pool;
> struct svc_rqst *rqstp;
> int cpu;
> + bool queued = false;
>
> if (!svc_xprt_has_something_to_do(xprt))
> return;
> @@ -360,37 +342,60 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
>
> cpu = get_cpu();
> pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
> - spin_lock_bh(&pool->sp_lock);
>
> atomic_long_inc(&pool->sp_stats.packets);
>
> - if (!list_empty(&pool->sp_threads)) {
> - rqstp = list_entry(pool->sp_threads.next,
> - struct svc_rqst,
> - rq_list);
> - dprintk("svc: transport %p served by daemon %p\n",
> - xprt, rqstp);
> - svc_thread_dequeue(pool, rqstp);
> - if (rqstp->rq_xprt)
> - printk(KERN_ERR
> - "svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
> - rqstp, rqstp->rq_xprt);
> - /* Note the order of the following 3 lines:
> - * We want to assign xprt to rqstp->rq_xprt only _after_
> - * we've woken up the process, so that we don't race with
> - * the lockless check in svc_get_next_xprt().
> +redo_search:
> + /* find a thread for this xprt */
> + rcu_read_lock();
> + list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
> + /* Do a lockless check first */
> + if (test_bit(RQ_BUSY, &rqstp->rq_flags))
> + continue;
> +
> + /*
> + * Once the xprt has been queued, it can only be dequeued by
> + * the task that intends to service it. All we can do at that
> + * point is to try to wake this thread back up so that it can
> + * do so.
> */
> - svc_xprt_get(xprt);
> - wake_up_process(rqstp->rq_task);
> - rqstp->rq_xprt = xprt;
> + if (!queued) {
> + spin_lock_bh(&rqstp->rq_lock);
> + if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) {
> + /* already busy, move on... */
> + spin_unlock_bh(&rqstp->rq_lock);
> + continue;
> + }
> +
> + /* this one will do */
> + rqstp->rq_xprt = xprt;
> + svc_xprt_get(xprt);
> + spin_unlock_bh(&rqstp->rq_lock);
> + }
> + rcu_read_unlock();
> +
> atomic_long_inc(&pool->sp_stats.threads_woken);
> - } else {
> + wake_up_process(rqstp->rq_task);
> + put_cpu();
> + return;
> + }
> + rcu_read_unlock();
> +
> + /*
> + * We didn't find an idle thread to use, so we need to queue the xprt.
> + * Do so and then search again. If we find one, we can't hook this one
> + * up to it directly but we can wake the thread up in the hopes that it
> + * will pick it up once it searches for a xprt to service.
> + */
> + if (!queued) {
> + queued = true;
> dprintk("svc: transport %p put into queue\n", xprt);
> + spin_lock_bh(&pool->sp_lock);
> list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
> pool->sp_stats.sockets_queued++;
> + spin_unlock_bh(&pool->sp_lock);
> + goto redo_search;
> }
> -
> - spin_unlock_bh(&pool->sp_lock);
> put_cpu();
> }
>
> @@ -408,21 +413,26 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
> EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
>
> /*
> - * Dequeue the first transport. Must be called with the pool->sp_lock held.
> + * Dequeue the first transport, if there is one.
> */
> static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
> {
> - struct svc_xprt *xprt;
> + struct svc_xprt *xprt = NULL;
>
> if (list_empty(&pool->sp_sockets))
> return NULL;
>
> - xprt = list_entry(pool->sp_sockets.next,
> - struct svc_xprt, xpt_ready);
> - list_del_init(&xprt->xpt_ready);
> + spin_lock_bh(&pool->sp_lock);
> + if (likely(!list_empty(&pool->sp_sockets))) {
> + xprt = list_first_entry(&pool->sp_sockets,
> + struct svc_xprt, xpt_ready);
> + list_del_init(&xprt->xpt_ready);
> + svc_xprt_get(xprt);
>
> - dprintk("svc: transport %p dequeued, inuse=%d\n",
> - xprt, atomic_read(&xprt->xpt_ref.refcount));
> + dprintk("svc: transport %p dequeued, inuse=%d\n",
> + xprt, atomic_read(&xprt->xpt_ref.refcount));
> + }
> + spin_unlock_bh(&pool->sp_lock);
>
> return xprt;
> }
> @@ -497,16 +507,21 @@ void svc_wake_up(struct svc_serv *serv)
>
> pool = &serv->sv_pools[0];
>
> - spin_lock_bh(&pool->sp_lock);
> - if (!list_empty(&pool->sp_threads)) {
> - rqstp = list_entry(pool->sp_threads.next,
> - struct svc_rqst,
> - rq_list);
> + rcu_read_lock();
> + list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
> + /* skip any that aren't queued */
> + if (test_bit(RQ_BUSY, &rqstp->rq_flags))
> + continue;
> + rcu_read_unlock();
> dprintk("svc: daemon %p woken up.\n", rqstp);
> wake_up_process(rqstp->rq_task);
> - } else
> - set_bit(SP_TASK_PENDING, &pool->sp_flags);
> - spin_unlock_bh(&pool->sp_lock);
> + return;
> + }
> + rcu_read_unlock();
> +
> + /* No free entries available */
> + set_bit(SP_TASK_PENDING, &pool->sp_flags);
> + smp_wmb();
> }
> EXPORT_SYMBOL_GPL(svc_wake_up);
>
> @@ -617,22 +632,47 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
> return 0;
> }
>
> +static bool
> +rqst_should_sleep(struct svc_rqst *rqstp)
> +{
> + struct svc_pool *pool = rqstp->rq_pool;
> +
> + /* did someone call svc_wake_up? */
> + if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags))
> + return false;
> +
> + /* was a socket queued? */
> + if (!list_empty(&pool->sp_sockets))
> + return false;
> +
> + /* are we shutting down? */
> + if (signalled() || kthread_should_stop())
> + return false;
> +
> + /* are we freezing? */
> + if (freezing(current))
> + return false;
> +
> + return true;
> +}
> +
> static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
> {
> struct svc_xprt *xprt;
> struct svc_pool *pool = rqstp->rq_pool;
> long time_left = 0;
>
> + /* rq_xprt should be clear on entry */
> + WARN_ON_ONCE(rqstp->rq_xprt);
> +
> /* Normally we will wait up to 5 seconds for any required
> * cache information to be provided.
> */
> rqstp->rq_chandle.thread_wait = 5*HZ;
>
> - spin_lock_bh(&pool->sp_lock);
> xprt = svc_xprt_dequeue(pool);
> if (xprt) {
> rqstp->rq_xprt = xprt;
> - svc_xprt_get(xprt);
>
> /* As there is a shortage of threads and this request
> * had to be queued, don't allow the thread to wait so
> @@ -640,51 +680,38 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
> */
> rqstp->rq_chandle.thread_wait = 1*HZ;
> clear_bit(SP_TASK_PENDING, &pool->sp_flags);
> - } else {
> - if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags)) {
> - xprt = ERR_PTR(-EAGAIN);
> - goto out;
> - }
> - /*
> - * We have to be able to interrupt this wait
> - * to bring down the daemons ...
> - */
> - set_current_state(TASK_INTERRUPTIBLE);
> + return xprt;
> + }
>
> - /* No data pending. Go to sleep */
> - svc_thread_enqueue(pool, rqstp);
> - spin_unlock_bh(&pool->sp_lock);
> + /*
> + * We have to be able to interrupt this wait
> + * to bring down the daemons ...
> + */
> + set_current_state(TASK_INTERRUPTIBLE);
> + clear_bit(RQ_BUSY, &rqstp->rq_flags);
> + smp_mb();
> +
> + if (likely(rqst_should_sleep(rqstp)))
> + time_left = schedule_timeout(timeout);
> + else
> + __set_current_state(TASK_RUNNING);
>
> - if (!(signalled() || kthread_should_stop())) {
> - time_left = schedule_timeout(timeout);
> - __set_current_state(TASK_RUNNING);
> + try_to_freeze();
>
> - try_to_freeze();
> + spin_lock_bh(&rqstp->rq_lock);
> + set_bit(RQ_BUSY, &rqstp->rq_flags);
> + spin_unlock_bh(&rqstp->rq_lock);
>
> - xprt = rqstp->rq_xprt;
> - if (xprt != NULL)
> - return xprt;
> - } else
> - __set_current_state(TASK_RUNNING);
> + xprt = rqstp->rq_xprt;
> + if (xprt != NULL)
> + return xprt;
>
> - spin_lock_bh(&pool->sp_lock);
> - if (!time_left)
> - atomic_long_inc(&pool->sp_stats.threads_timedout);
> + if (!time_left)
> + atomic_long_inc(&pool->sp_stats.threads_timedout);
>
> - xprt = rqstp->rq_xprt;
> - if (!xprt) {
> - svc_thread_dequeue(pool, rqstp);
> - spin_unlock_bh(&pool->sp_lock);
> - dprintk("svc: server %p, no data yet\n", rqstp);
> - if (signalled() || kthread_should_stop())
> - return ERR_PTR(-EINTR);
> - else
> - return ERR_PTR(-EAGAIN);
> - }
> - }
> -out:
> - spin_unlock_bh(&pool->sp_lock);
> - return xprt;
> + if (signalled() || kthread_should_stop())
> + return ERR_PTR(-EINTR);
> + return ERR_PTR(-EAGAIN);
> }
>
> static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
> --
> 2.1.0
>
On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> On Fri, Nov 21, 2014 at 02:19:30PM -0500, Jeff Layton wrote:
>> Testing has shown that the pool->sp_lock can be a bottleneck on a busy
>> server. Every time data is received on a socket, the server must take
>> that lock in order to dequeue a thread from the sp_threads list.
>>
>> Address this problem by eliminating the sp_threads list (which contains
>> threads that are currently idle) and replacing it with a RQ_BUSY flag in
>> svc_rqst. This allows us to walk the sp_all_threads list under the
>> rcu_read_lock and find a suitable thread for the xprt by doing a
>> test_and_set_bit.
>>
>> Note that we do still have a potential atomicity problem however with
>> this approach. We don't want svc_xprt_do_enqueue to set the
>> rqst->rq_xprt pointer unless a test_and_set_bit of RQ_BUSY returned
>> negative (which indicates that the thread was idle). But, by the time we
>> check that, the big could be flipped by a waking thread.
>
> (Nits: replacing "negative" by "zero" and "big" by "bit".)
>
>> To address this, we acquire a new per-rqst spinlock (rq_lock) and take
>> that before doing the test_and_set_bit. If that returns false, then we
>> can set rq_xprt and drop the spinlock. Then, when the thread wakes up,
>> it must set the bit under the same spinlock and can trust that if it was
>> already set then the rq_xprt is also properly set.
>>
>> With this scheme, the case where we have an idle thread no longer needs
>> to take the highly contended pool->sp_lock at all, and that removes the
>> bottleneck.
>>
>> That still leaves one issue: What of the case where we walk the whole
>> sp_all_threads list and don't find an idle thread? Because the search is
>> lockess, it's possible for the queueing to race with a thread that is
>> going to sleep. To address that, we queue the xprt and then search again.
>>
>> If we find an idle thread at that point, we can't attach the xprt to it
>> directly since that might race with a different thread waking up and
>> finding it. All we can do is wake the idle thread back up and let it
>> attempt to find the now-queued xprt.
>
> I find it hard to think about how we expect this to affect performance.
> So it comes down to the observed results, I guess, but just trying to
> get an idea:
>
> - this eliminates sp_lock. I think the original idea here was
> that if interrupts could be routed correctly then there
> shouldn't normally be cross-cpu contention on this lock. Do
> we understand why that didn't pan out? Is hardware capable of
> doing this really rare, or is it just too hard to configure it
> correctly?
One problem is that a 1MB incoming write will generate a lot of
interrupts. While that is not so noticeable on a 1GigE network, it is
on a 40GigE network. The other thing you should note is that this
workload was generated with ~100 clients pounding on that server, so
there are a fair amount of TCP connections to service in parallel.
Playing with the interrupt routing doesn't necessarily help you so
much when all those connections are hot.
> - instead we're walking the list of all threads looking for an
> idle one. I suppose that's tpyically not more than a few
> hundred. Does this being fast depend on the fact that that
> list is almost never changed? Should we be rearranging
> svc_rqst so frequently-written fields aren't nearby?
Given a 64-byte cache line, that is 8 pointers worth on a 64-bit processor.
- rq_all, rq_server, rq_pool, rq_task don't ever change, so perhaps
shove them together into the same cacheline?
- rq_xprt does get set often until we have a full RPC request worth of
data, so perhaps consider moving that.
- OTOH, rq_addr, rq_addrlen, rq_daddr, rq_daddrlen are only set once
we have a full RPC to process, and then keep their values until that
RPC call is finished. That doesn't look too bad.
Cheers
Trond
On Mon, 1 Dec 2014 19:38:19 -0500
Trond Myklebust <[email protected]> wrote:
> On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> > On Fri, Nov 21, 2014 at 02:19:30PM -0500, Jeff Layton wrote:
> >> Testing has shown that the pool->sp_lock can be a bottleneck on a busy
> >> server. Every time data is received on a socket, the server must take
> >> that lock in order to dequeue a thread from the sp_threads list.
> >>
> >> Address this problem by eliminating the sp_threads list (which contains
> >> threads that are currently idle) and replacing it with a RQ_BUSY flag in
> >> svc_rqst. This allows us to walk the sp_all_threads list under the
> >> rcu_read_lock and find a suitable thread for the xprt by doing a
> >> test_and_set_bit.
> >>
> >> Note that we do still have a potential atomicity problem however with
> >> this approach. We don't want svc_xprt_do_enqueue to set the
> >> rqst->rq_xprt pointer unless a test_and_set_bit of RQ_BUSY returned
> >> negative (which indicates that the thread was idle). But, by the time we
> >> check that, the big could be flipped by a waking thread.
> >
> > (Nits: replacing "negative" by "zero" and "big" by "bit".)
> >
> >> To address this, we acquire a new per-rqst spinlock (rq_lock) and take
> >> that before doing the test_and_set_bit. If that returns false, then we
> >> can set rq_xprt and drop the spinlock. Then, when the thread wakes up,
> >> it must set the bit under the same spinlock and can trust that if it was
> >> already set then the rq_xprt is also properly set.
> >>
> >> With this scheme, the case where we have an idle thread no longer needs
> >> to take the highly contended pool->sp_lock at all, and that removes the
> >> bottleneck.
> >>
> >> That still leaves one issue: What of the case where we walk the whole
> >> sp_all_threads list and don't find an idle thread? Because the search is
> >> lockess, it's possible for the queueing to race with a thread that is
> >> going to sleep. To address that, we queue the xprt and then search again.
> >>
> >> If we find an idle thread at that point, we can't attach the xprt to it
> >> directly since that might race with a different thread waking up and
> >> finding it. All we can do is wake the idle thread back up and let it
> >> attempt to find the now-queued xprt.
> >
> > I find it hard to think about how we expect this to affect performance.
> > So it comes down to the observed results, I guess, but just trying to
> > get an idea:
> >
> > - this eliminates sp_lock. I think the original idea here was
> > that if interrupts could be routed correctly then there
> > shouldn't normally be cross-cpu contention on this lock. Do
> > we understand why that didn't pan out? Is hardware capable of
> > doing this really rare, or is it just too hard to configure it
> > correctly?
>
> One problem is that a 1MB incoming write will generate a lot of
> interrupts. While that is not so noticeable on a 1GigE network, it is
> on a 40GigE network. The other thing you should note is that this
> workload was generated with ~100 clients pounding on that server, so
> there are a fair amount of TCP connections to service in parallel.
> Playing with the interrupt routing doesn't necessarily help you so
> much when all those connections are hot.
>
> > - instead we're walking the list of all threads looking for an
> > idle one. I suppose that's tpyically not more than a few
> > hundred. Does this being fast depend on the fact that that
> > list is almost never changed? Should we be rearranging
> > svc_rqst so frequently-written fields aren't nearby?
>
> Given a 64-byte cache line, that is 8 pointers worth on a 64-bit processor.
>
> - rq_all, rq_server, rq_pool, rq_task don't ever change, so perhaps
> shove them together into the same cacheline?
>
> - rq_xprt does get set often until we have a full RPC request worth of
> data, so perhaps consider moving that.
>
> - OTOH, rq_addr, rq_addrlen, rq_daddr, rq_daddrlen are only set once
> we have a full RPC to process, and then keep their values until that
> RPC call is finished. That doesn't look too bad.
>
> Cheers
> Trond
--
Jeff Layton <[email protected]>
On Tue, 2 Dec 2014 06:57:50 -0500
Jeff Layton <[email protected]> wrote:
> On Mon, 1 Dec 2014 19:38:19 -0500
> Trond Myklebust <[email protected]> wrote:
>
> > On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> > > On Fri, Nov 21, 2014 at 02:19:30PM -0500, Jeff Layton wrote:
> > >> Testing has shown that the pool->sp_lock can be a bottleneck on a busy
> > >> server. Every time data is received on a socket, the server must take
> > >> that lock in order to dequeue a thread from the sp_threads list.
> > >>
> > >> Address this problem by eliminating the sp_threads list (which contains
> > >> threads that are currently idle) and replacing it with a RQ_BUSY flag in
> > >> svc_rqst. This allows us to walk the sp_all_threads list under the
> > >> rcu_read_lock and find a suitable thread for the xprt by doing a
> > >> test_and_set_bit.
> > >>
> > >> Note that we do still have a potential atomicity problem however with
> > >> this approach. We don't want svc_xprt_do_enqueue to set the
> > >> rqst->rq_xprt pointer unless a test_and_set_bit of RQ_BUSY returned
> > >> negative (which indicates that the thread was idle). But, by the time we
> > >> check that, the big could be flipped by a waking thread.
> > >
> > > (Nits: replacing "negative" by "zero" and "big" by "bit".)
> > >
>
Sorry, hit send too quickly...
Thanks for fixing those.
> > >> To address this, we acquire a new per-rqst spinlock (rq_lock) and take
> > >> that before doing the test_and_set_bit. If that returns false, then we
> > >> can set rq_xprt and drop the spinlock. Then, when the thread wakes up,
> > >> it must set the bit under the same spinlock and can trust that if it was
> > >> already set then the rq_xprt is also properly set.
> > >>
> > >> With this scheme, the case where we have an idle thread no longer needs
> > >> to take the highly contended pool->sp_lock at all, and that removes the
> > >> bottleneck.
> > >>
> > >> That still leaves one issue: What of the case where we walk the whole
> > >> sp_all_threads list and don't find an idle thread? Because the search is
> > >> lockess, it's possible for the queueing to race with a thread that is
> > >> going to sleep. To address that, we queue the xprt and then search again.
> > >>
> > >> If we find an idle thread at that point, we can't attach the xprt to it
> > >> directly since that might race with a different thread waking up and
> > >> finding it. All we can do is wake the idle thread back up and let it
> > >> attempt to find the now-queued xprt.
> > >
> > > I find it hard to think about how we expect this to affect performance.
> > > So it comes down to the observed results, I guess, but just trying to
> > > get an idea:
> > >
> > > - this eliminates sp_lock. I think the original idea here was
> > > that if interrupts could be routed correctly then there
> > > shouldn't normally be cross-cpu contention on this lock. Do
> > > we understand why that didn't pan out? Is hardware capable of
> > > doing this really rare, or is it just too hard to configure it
> > > correctly?
> >
> > One problem is that a 1MB incoming write will generate a lot of
> > interrupts. While that is not so noticeable on a 1GigE network, it is
> > on a 40GigE network. The other thing you should note is that this
> > workload was generated with ~100 clients pounding on that server, so
> > there are a fair amount of TCP connections to service in parallel.
> > Playing with the interrupt routing doesn't necessarily help you so
> > much when all those connections are hot.
> >
In principle though, the percpu pool_mode should have alleviated the
contention on the sp_lock. When an interrupt comes in, the xprt gets
queued to its pool. If there is a pool for each cpu then there should
be no sp_lock contention. The pernode pool mode might also have
alleviated the lock contention to a lesser degree in a NUMA
configuration.
Do we understand why that didn't help?
In any case, I think that doing this with RCU is still preferable.
We're walking a very short list, so doing it lockless is still a
good idea to improve performance without needing to use the percpu
pool_mode.
> > > - instead we're walking the list of all threads looking for an
> > > idle one. I suppose that's tpyically not more than a few
> > > hundred. Does this being fast depend on the fact that that
> > > list is almost never changed? Should we be rearranging
> > > svc_rqst so frequently-written fields aren't nearby?
> >
> > Given a 64-byte cache line, that is 8 pointers worth on a 64-bit processor.
> >
> > - rq_all, rq_server, rq_pool, rq_task don't ever change, so perhaps
> > shove them together into the same cacheline?
> >
> > - rq_xprt does get set often until we have a full RPC request worth of
> > data, so perhaps consider moving that.
> >
> > - OTOH, rq_addr, rq_addrlen, rq_daddr, rq_daddrlen are only set once
> > we have a full RPC to process, and then keep their values until that
> > RPC call is finished. That doesn't look too bad.
> >
That sounds reasonable to me.
--
Jeff Layton <[email protected]>
On Tue, Dec 02, 2014 at 07:14:22AM -0500, Jeff Layton wrote:
> On Tue, 2 Dec 2014 06:57:50 -0500
> Jeff Layton <[email protected]> wrote:
>
> > On Mon, 1 Dec 2014 19:38:19 -0500
> > Trond Myklebust <[email protected]> wrote:
> >
> > > On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> > > > I find it hard to think about how we expect this to affect performance.
> > > > So it comes down to the observed results, I guess, but just trying to
> > > > get an idea:
> > > >
> > > > - this eliminates sp_lock. I think the original idea here was
> > > > that if interrupts could be routed correctly then there
> > > > shouldn't normally be cross-cpu contention on this lock. Do
> > > > we understand why that didn't pan out? Is hardware capable of
> > > > doing this really rare, or is it just too hard to configure it
> > > > correctly?
> > >
> > > One problem is that a 1MB incoming write will generate a lot of
> > > interrupts. While that is not so noticeable on a 1GigE network, it is
> > > on a 40GigE network. The other thing you should note is that this
> > > workload was generated with ~100 clients pounding on that server, so
> > > there are a fair amount of TCP connections to service in parallel.
> > > Playing with the interrupt routing doesn't necessarily help you so
> > > much when all those connections are hot.
> > >
>
> In principle though, the percpu pool_mode should have alleviated the
> contention on the sp_lock. When an interrupt comes in, the xprt gets
> queued to its pool. If there is a pool for each cpu then there should
> be no sp_lock contention. The pernode pool mode might also have
> alleviated the lock contention to a lesser degree in a NUMA
> configuration.
>
> Do we understand why that didn't help?
Yes, the lots-of-interrupts-per-rpc problem strikes me as a separate if
not entirely orthogonal problem.
(And I thought it should be addressable separately; Trond and I talked
about this in Westford. I think it currently wakes a thread to handle
each individual tcp segment--but shouldn't it be able to do all the data
copying in the interrupt and wait to wake up a thread until it's got the
entire rpc?)
> In any case, I think that doing this with RCU is still preferable.
> We're walking a very short list, so doing it lockless is still a
> good idea to improve performance without needing to use the percpu
> pool_mode.
I find that entirely plausible.
Maybe it would help to ask SGI people. Cc'ing Ben Myers in hopes he
could point us to the right person. It'd be interesting to know:
- are they using the svc_pool stuff?
- if not, why not?
- if so:
- can they explain how they configure systems to take
advantage of it?
- do they have any recent results showing how it helps?
- could they test Jeff's patches for performance
regressions?
Anyway, I'm off for now, back to work Thursday.
--b.
Hey Bruce,
On Tue, Dec 02, 2014 at 11:50:24AM -0500, J. Bruce Fields wrote:
> On Tue, Dec 02, 2014 at 07:14:22AM -0500, Jeff Layton wrote:
> > On Tue, 2 Dec 2014 06:57:50 -0500
> > Jeff Layton <[email protected]> wrote:
> >
> > > On Mon, 1 Dec 2014 19:38:19 -0500
> > > Trond Myklebust <[email protected]> wrote:
> > >
> > > > On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> > > > > I find it hard to think about how we expect this to affect performance.
> > > > > So it comes down to the observed results, I guess, but just trying to
> > > > > get an idea:
> > > > >
> > > > > - this eliminates sp_lock. I think the original idea here was
> > > > > that if interrupts could be routed correctly then there
> > > > > shouldn't normally be cross-cpu contention on this lock. Do
> > > > > we understand why that didn't pan out? Is hardware capable of
> > > > > doing this really rare, or is it just too hard to configure it
> > > > > correctly?
> > > >
> > > > One problem is that a 1MB incoming write will generate a lot of
> > > > interrupts. While that is not so noticeable on a 1GigE network, it is
> > > > on a 40GigE network. The other thing you should note is that this
> > > > workload was generated with ~100 clients pounding on that server, so
> > > > there are a fair amount of TCP connections to service in parallel.
> > > > Playing with the interrupt routing doesn't necessarily help you so
> > > > much when all those connections are hot.
> > > >
> >
> > In principle though, the percpu pool_mode should have alleviated the
> > contention on the sp_lock. When an interrupt comes in, the xprt gets
> > queued to its pool. If there is a pool for each cpu then there should
> > be no sp_lock contention. The pernode pool mode might also have
> > alleviated the lock contention to a lesser degree in a NUMA
> > configuration.
> >
> > Do we understand why that didn't help?
>
> Yes, the lots-of-interrupts-per-rpc problem strikes me as a separate if
> not entirely orthogonal problem.
>
> (And I thought it should be addressable separately; Trond and I talked
> about this in Westford. I think it currently wakes a thread to handle
> each individual tcp segment--but shouldn't it be able to do all the data
> copying in the interrupt and wait to wake up a thread until it's got the
> entire rpc?)
>
> > In any case, I think that doing this with RCU is still preferable.
> > We're walking a very short list, so doing it lockless is still a
> > good idea to improve performance without needing to use the percpu
> > pool_mode.
>
> I find that entirely plausible.
>
> Maybe it would help to ask SGI people. Cc'ing Ben Myers in hopes he
> could point us to the right person.
>
> It'd be interesting to know:
>
> - are they using the svc_pool stuff?
> - if not, why not?
> - if so:
> - can they explain how they configure systems to take
> advantage of it?
> - do they have any recent results showing how it helps?
> - could they test Jeff's patches for performance
> regressions?
>
> Anyway, I'm off for now, back to work Thursday.
>
> --b.
Andrew Dahl is the right person. Cc'd.
Regards,
Ben
On Tue, Dec 02, 2014 at 12:53:58PM -0600, Ben Myers wrote:
> Hey Bruce,
>
> On Tue, Dec 02, 2014 at 11:50:24AM -0500, J. Bruce Fields wrote:
> > On Tue, Dec 02, 2014 at 07:14:22AM -0500, Jeff Layton wrote:
> > > On Tue, 2 Dec 2014 06:57:50 -0500
> > > Jeff Layton <[email protected]> wrote:
> > >
> > > > On Mon, 1 Dec 2014 19:38:19 -0500
> > > > Trond Myklebust <[email protected]> wrote:
> > > >
> > > > > On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> > > > > > I find it hard to think about how we expect this to affect performance.
> > > > > > So it comes down to the observed results, I guess, but just trying to
> > > > > > get an idea:
> > > > > >
> > > > > > - this eliminates sp_lock. I think the original idea here was
> > > > > > that if interrupts could be routed correctly then there
> > > > > > shouldn't normally be cross-cpu contention on this lock. Do
> > > > > > we understand why that didn't pan out? Is hardware capable of
> > > > > > doing this really rare, or is it just too hard to configure it
> > > > > > correctly?
> > > > >
> > > > > One problem is that a 1MB incoming write will generate a lot of
> > > > > interrupts. While that is not so noticeable on a 1GigE network, it is
> > > > > on a 40GigE network. The other thing you should note is that this
> > > > > workload was generated with ~100 clients pounding on that server, so
> > > > > there are a fair amount of TCP connections to service in parallel.
> > > > > Playing with the interrupt routing doesn't necessarily help you so
> > > > > much when all those connections are hot.
> > > > >
> > >
> > > In principle though, the percpu pool_mode should have alleviated the
> > > contention on the sp_lock. When an interrupt comes in, the xprt gets
> > > queued to its pool. If there is a pool for each cpu then there should
> > > be no sp_lock contention. The pernode pool mode might also have
> > > alleviated the lock contention to a lesser degree in a NUMA
> > > configuration.
> > >
> > > Do we understand why that didn't help?
> >
> > Yes, the lots-of-interrupts-per-rpc problem strikes me as a separate if
> > not entirely orthogonal problem.
> >
> > (And I thought it should be addressable separately; Trond and I talked
> > about this in Westford. I think it currently wakes a thread to handle
> > each individual tcp segment--but shouldn't it be able to do all the data
> > copying in the interrupt and wait to wake up a thread until it's got the
> > entire rpc?)
> >
> > > In any case, I think that doing this with RCU is still preferable.
> > > We're walking a very short list, so doing it lockless is still a
> > > good idea to improve performance without needing to use the percpu
> > > pool_mode.
> >
> > I find that entirely plausible.
> >
> > Maybe it would help to ask SGI people. Cc'ing Ben Myers in hopes he
> > could point us to the right person.
> >
> > It'd be interesting to know:
> >
> > - are they using the svc_pool stuff?
> > - if not, why not?
> > - if so:
> > - can they explain how they configure systems to take
> > advantage of it?
> > - do they have any recent results showing how it helps?
> > - could they test Jeff's patches for performance
> > regressions?
> >
> > Anyway, I'm off for now, back to work Thursday.
> >
> > --b.
>
> Andrew Dahl is the right person. Cc'd.
Thanks!
I'm less worried about Jeff's particular changes here, but I would still
really love to see answers to the above questions.
We've had a couple cases now of people trying to use the pool_modes for
performance tuning without good results, and I'd like to figure out
what's happening. If this keeps up then we may end up just breaking
them by accident (if we haven't already).
--b.
On Tue, Dec 02, 2014 at 11:50:24AM -0500, J. Bruce Fields wrote:
> On Tue, Dec 02, 2014 at 07:14:22AM -0500, Jeff Layton wrote:
> > On Tue, 2 Dec 2014 06:57:50 -0500
> > Jeff Layton <[email protected]> wrote:
> >
> > > On Mon, 1 Dec 2014 19:38:19 -0500
> > > Trond Myklebust <[email protected]> wrote:
> > >
> > > > On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> > > > > I find it hard to think about how we expect this to affect performance.
> > > > > So it comes down to the observed results, I guess, but just trying to
> > > > > get an idea:
> > > > >
> > > > > - this eliminates sp_lock. I think the original idea here was
> > > > > that if interrupts could be routed correctly then there
> > > > > shouldn't normally be cross-cpu contention on this lock. Do
> > > > > we understand why that didn't pan out? Is hardware capable of
> > > > > doing this really rare, or is it just too hard to configure it
> > > > > correctly?
> > > >
> > > > One problem is that a 1MB incoming write will generate a lot of
> > > > interrupts. While that is not so noticeable on a 1GigE network, it is
> > > > on a 40GigE network. The other thing you should note is that this
> > > > workload was generated with ~100 clients pounding on that server, so
> > > > there are a fair amount of TCP connections to service in parallel.
> > > > Playing with the interrupt routing doesn't necessarily help you so
> > > > much when all those connections are hot.
> > > >
> >
> > In principle though, the percpu pool_mode should have alleviated the
> > contention on the sp_lock. When an interrupt comes in, the xprt gets
> > queued to its pool. If there is a pool for each cpu then there should
> > be no sp_lock contention. The pernode pool mode might also have
> > alleviated the lock contention to a lesser degree in a NUMA
> > configuration.
> >
> > Do we understand why that didn't help?
>
> Yes, the lots-of-interrupts-per-rpc problem strikes me as a separate if
> not entirely orthogonal problem.
>
> (And I thought it should be addressable separately; Trond and I talked
> about this in Westford. I think it currently wakes a thread to handle
> each individual tcp segment--but shouldn't it be able to do all the data
> copying in the interrupt and wait to wake up a thread until it's got the
> entire rpc?)
By the way, Jeff, isn't this part of what's complicating the workqueue
change? That would seem simpler if we didn't need to queue work until
we had the full rpc.
--b.
On Mon, 8 Dec 2014 13:57:31 -0500
"J. Bruce Fields" <[email protected]> wrote:
> On Tue, Dec 02, 2014 at 11:50:24AM -0500, J. Bruce Fields wrote:
> > On Tue, Dec 02, 2014 at 07:14:22AM -0500, Jeff Layton wrote:
> > > On Tue, 2 Dec 2014 06:57:50 -0500
> > > Jeff Layton <[email protected]> wrote:
> > >
> > > > On Mon, 1 Dec 2014 19:38:19 -0500
> > > > Trond Myklebust <[email protected]> wrote:
> > > >
> > > > > On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> > > > > > I find it hard to think about how we expect this to affect performance.
> > > > > > So it comes down to the observed results, I guess, but just trying to
> > > > > > get an idea:
> > > > > >
> > > > > > - this eliminates sp_lock. I think the original idea here was
> > > > > > that if interrupts could be routed correctly then there
> > > > > > shouldn't normally be cross-cpu contention on this lock. Do
> > > > > > we understand why that didn't pan out? Is hardware capable of
> > > > > > doing this really rare, or is it just too hard to configure it
> > > > > > correctly?
> > > > >
> > > > > One problem is that a 1MB incoming write will generate a lot of
> > > > > interrupts. While that is not so noticeable on a 1GigE network, it is
> > > > > on a 40GigE network. The other thing you should note is that this
> > > > > workload was generated with ~100 clients pounding on that server, so
> > > > > there are a fair amount of TCP connections to service in parallel.
> > > > > Playing with the interrupt routing doesn't necessarily help you so
> > > > > much when all those connections are hot.
> > > > >
> > >
> > > In principle though, the percpu pool_mode should have alleviated the
> > > contention on the sp_lock. When an interrupt comes in, the xprt gets
> > > queued to its pool. If there is a pool for each cpu then there should
> > > be no sp_lock contention. The pernode pool mode might also have
> > > alleviated the lock contention to a lesser degree in a NUMA
> > > configuration.
> > >
> > > Do we understand why that didn't help?
> >
> > Yes, the lots-of-interrupts-per-rpc problem strikes me as a separate if
> > not entirely orthogonal problem.
> >
> > (And I thought it should be addressable separately; Trond and I talked
> > about this in Westford. I think it currently wakes a thread to handle
> > each individual tcp segment--but shouldn't it be able to do all the data
> > copying in the interrupt and wait to wake up a thread until it's got the
> > entire rpc?)
>
> By the way, Jeff, isn't this part of what's complicating the workqueue
> change? That would seem simpler if we didn't need to queue work until
> we had the full rpc.
>
No, I don't think that really adds much in the way of complexity there.
I have that set working. Most of what's holding me up from posting the
next iteration of that set is performance. So far, my testing shows
that the workqueue-based code is slightly slower. I've been trying to
figure out why that is and whether I can do anything about it. Maybe
I'll go ahead and post it as a second RFC set, until I can get to the
bottom of the perf delta.
I have pondered doing what you're suggesting above though and it's not a
trivial change.
The problem is that all of the buffers into which we do receives are
associated with the svc_rqst (which we don't really have when the
interrupt comes in), and not the svc_xprt (which we do have at that
point).
So, you'd need to restructure the code to hang a receive buffer off
of the svc_xprt. Once you receive an entire RPC, you'd then have to
flip that buffer over to a svc_rqst, queue up the job and grab a new
buffer for the xprt (maybe you could swap them?).
The problem is what to do if you don't have a buffer (or svc_rqst)
available when an IRQ comes in. You can't allocate one from softirq
context, so you'd need to offload that case to a workqueue or something
anyway (which adds a bit of complexity as you'd then have to deal with
two different receive paths).
I'm also not sure about RDMA. When you get an RPC, the server usually
has to do an RDMA READ from the client to pull all of the data in. I
don't think you want to do that from softirq context, so that would
also need to be queued up somehow.
All of that said, it would probably reduce some context switching if
we can make that work. Also, I suspect that doing that in the context
of the workqueue-based code would probably be at least a little simpler.
--
Jeff Layton <[email protected]>
On Mon, Dec 08, 2014 at 02:54:29PM -0500, Jeff Layton wrote:
> On Mon, 8 Dec 2014 13:57:31 -0500
> "J. Bruce Fields" <[email protected]> wrote:
>
> > On Tue, Dec 02, 2014 at 11:50:24AM -0500, J. Bruce Fields wrote:
> > > On Tue, Dec 02, 2014 at 07:14:22AM -0500, Jeff Layton wrote:
> > > > On Tue, 2 Dec 2014 06:57:50 -0500
> > > > Jeff Layton <[email protected]> wrote:
> > > >
> > > > > On Mon, 1 Dec 2014 19:38:19 -0500
> > > > > Trond Myklebust <[email protected]> wrote:
> > > > >
> > > > > > On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> > > > > > > I find it hard to think about how we expect this to affect performance.
> > > > > > > So it comes down to the observed results, I guess, but just trying to
> > > > > > > get an idea:
> > > > > > >
> > > > > > > - this eliminates sp_lock. I think the original idea here was
> > > > > > > that if interrupts could be routed correctly then there
> > > > > > > shouldn't normally be cross-cpu contention on this lock. Do
> > > > > > > we understand why that didn't pan out? Is hardware capable of
> > > > > > > doing this really rare, or is it just too hard to configure it
> > > > > > > correctly?
> > > > > >
> > > > > > One problem is that a 1MB incoming write will generate a lot of
> > > > > > interrupts. While that is not so noticeable on a 1GigE network, it is
> > > > > > on a 40GigE network. The other thing you should note is that this
> > > > > > workload was generated with ~100 clients pounding on that server, so
> > > > > > there are a fair amount of TCP connections to service in parallel.
> > > > > > Playing with the interrupt routing doesn't necessarily help you so
> > > > > > much when all those connections are hot.
> > > > > >
> > > >
> > > > In principle though, the percpu pool_mode should have alleviated the
> > > > contention on the sp_lock. When an interrupt comes in, the xprt gets
> > > > queued to its pool. If there is a pool for each cpu then there should
> > > > be no sp_lock contention. The pernode pool mode might also have
> > > > alleviated the lock contention to a lesser degree in a NUMA
> > > > configuration.
> > > >
> > > > Do we understand why that didn't help?
> > >
> > > Yes, the lots-of-interrupts-per-rpc problem strikes me as a separate if
> > > not entirely orthogonal problem.
> > >
> > > (And I thought it should be addressable separately; Trond and I talked
> > > about this in Westford. I think it currently wakes a thread to handle
> > > each individual tcp segment--but shouldn't it be able to do all the data
> > > copying in the interrupt and wait to wake up a thread until it's got the
> > > entire rpc?)
> >
> > By the way, Jeff, isn't this part of what's complicating the workqueue
> > change? That would seem simpler if we didn't need to queue work until
> > we had the full rpc.
> >
>
> No, I don't think that really adds much in the way of complexity there.
>
> I have that set working. Most of what's holding me up from posting the
> next iteration of that set is performance. So far, my testing shows
> that the workqueue-based code is slightly slower. I've been trying to
> figure out why that is and whether I can do anything about it. Maybe
> I'll go ahead and post it as a second RFC set, until I can get to the
> bottom of the perf delta.
>
> I have pondered doing what you're suggesting above though and it's not a
> trivial change.
>
> The problem is that all of the buffers into which we do receives are
> associated with the svc_rqst (which we don't really have when the
> interrupt comes in), and not the svc_xprt (which we do have at that
> point).
>
> So, you'd need to restructure the code to hang a receive buffer off
> of the svc_xprt.
Have you looked at svsk->sk_pages and svc_tcp_{save,restore}_pages?
--b.
> Once you receive an entire RPC, you'd then have to
> flip that buffer over to a svc_rqst, queue up the job and grab a new
> buffer for the xprt (maybe you could swap them?).
>
> The problem is what to do if you don't have a buffer (or svc_rqst)
> available when an IRQ comes in. You can't allocate one from softirq
> context, so you'd need to offload that case to a workqueue or something
> anyway (which adds a bit of complexity as you'd then have to deal with
> two different receive paths).
>
> I'm also not sure about RDMA. When you get an RPC, the server usually
> has to do an RDMA READ from the client to pull all of the data in. I
> don't think you want to do that from softirq context, so that would
> also need to be queued up somehow.
>
> All of that said, it would probably reduce some context switching if
> we can make that work. Also, I suspect that doing that in the context
> of the workqueue-based code would probably be at least a little simpler.
>
> --
> Jeff Layton <[email protected]>
On Mon, 8 Dec 2014 14:58:55 -0500
"J. Bruce Fields" <[email protected]> wrote:
> On Mon, Dec 08, 2014 at 02:54:29PM -0500, Jeff Layton wrote:
> > On Mon, 8 Dec 2014 13:57:31 -0500
> > "J. Bruce Fields" <[email protected]> wrote:
> >
> > > On Tue, Dec 02, 2014 at 11:50:24AM -0500, J. Bruce Fields wrote:
> > > > On Tue, Dec 02, 2014 at 07:14:22AM -0500, Jeff Layton wrote:
> > > > > On Tue, 2 Dec 2014 06:57:50 -0500
> > > > > Jeff Layton <[email protected]> wrote:
> > > > >
> > > > > > On Mon, 1 Dec 2014 19:38:19 -0500
> > > > > > Trond Myklebust <[email protected]> wrote:
> > > > > >
> > > > > > > On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> > > > > > > > I find it hard to think about how we expect this to affect performance.
> > > > > > > > So it comes down to the observed results, I guess, but just trying to
> > > > > > > > get an idea:
> > > > > > > >
> > > > > > > > - this eliminates sp_lock. I think the original idea here was
> > > > > > > > that if interrupts could be routed correctly then there
> > > > > > > > shouldn't normally be cross-cpu contention on this lock. Do
> > > > > > > > we understand why that didn't pan out? Is hardware capable of
> > > > > > > > doing this really rare, or is it just too hard to configure it
> > > > > > > > correctly?
> > > > > > >
> > > > > > > One problem is that a 1MB incoming write will generate a lot of
> > > > > > > interrupts. While that is not so noticeable on a 1GigE network, it is
> > > > > > > on a 40GigE network. The other thing you should note is that this
> > > > > > > workload was generated with ~100 clients pounding on that server, so
> > > > > > > there are a fair amount of TCP connections to service in parallel.
> > > > > > > Playing with the interrupt routing doesn't necessarily help you so
> > > > > > > much when all those connections are hot.
> > > > > > >
> > > > >
> > > > > In principle though, the percpu pool_mode should have alleviated the
> > > > > contention on the sp_lock. When an interrupt comes in, the xprt gets
> > > > > queued to its pool. If there is a pool for each cpu then there should
> > > > > be no sp_lock contention. The pernode pool mode might also have
> > > > > alleviated the lock contention to a lesser degree in a NUMA
> > > > > configuration.
> > > > >
> > > > > Do we understand why that didn't help?
> > > >
> > > > Yes, the lots-of-interrupts-per-rpc problem strikes me as a separate if
> > > > not entirely orthogonal problem.
> > > >
> > > > (And I thought it should be addressable separately; Trond and I talked
> > > > about this in Westford. I think it currently wakes a thread to handle
> > > > each individual tcp segment--but shouldn't it be able to do all the data
> > > > copying in the interrupt and wait to wake up a thread until it's got the
> > > > entire rpc?)
> > >
> > > By the way, Jeff, isn't this part of what's complicating the workqueue
> > > change? That would seem simpler if we didn't need to queue work until
> > > we had the full rpc.
> > >
> >
> > No, I don't think that really adds much in the way of complexity there.
> >
> > I have that set working. Most of what's holding me up from posting the
> > next iteration of that set is performance. So far, my testing shows
> > that the workqueue-based code is slightly slower. I've been trying to
> > figure out why that is and whether I can do anything about it. Maybe
> > I'll go ahead and post it as a second RFC set, until I can get to the
> > bottom of the perf delta.
> >
> > I have pondered doing what you're suggesting above though and it's not a
> > trivial change.
> >
> > The problem is that all of the buffers into which we do receives are
> > associated with the svc_rqst (which we don't really have when the
> > interrupt comes in), and not the svc_xprt (which we do have at that
> > point).
> >
> > So, you'd need to restructure the code to hang a receive buffer off
> > of the svc_xprt.
>
> Have you looked at svsk->sk_pages and svc_tcp_{save,restore}_pages?
>
> --b.
>
Ahh, no I hadn't...interesting.
So, basically do the receive into the rqstp's buffer, and if you
don't get everything you need you stuff the pages into the sk_pages
array to await the next pass. Weird design...
Ok, so you could potentially flip that around. Do the receive into the
sk_pages buffer in softirq context, and then hand those off to the rqst
(in some fashion) once you've received a full RPC.
You'd have to work out how to replenish the sk_pages after each
receive, and what to do about RDMA, but it's probably doable.
> > Once you receive an entire RPC, you'd then have to
> > flip that buffer over to a svc_rqst, queue up the job and grab a new
> > buffer for the xprt (maybe you could swap them?).
> >
> > The problem is what to do if you don't have a buffer (or svc_rqst)
> > available when an IRQ comes in. You can't allocate one from softirq
> > context, so you'd need to offload that case to a workqueue or something
> > anyway (which adds a bit of complexity as you'd then have to deal with
> > two different receive paths).
> >
> > I'm also not sure about RDMA. When you get an RPC, the server usually
> > has to do an RDMA READ from the client to pull all of the data in. I
> > don't think you want to do that from softirq context, so that would
> > also need to be queued up somehow.
> >
> > All of that said, it would probably reduce some context switching if
> > we can make that work. Also, I suspect that doing that in the context
> > of the workqueue-based code would probably be at least a little simpler.
> >
> > --
> > Jeff Layton <[email protected]>
--
Jeff Layton <[email protected]>
On Tue, Dec 02, 2014 at 07:14:22AM -0500, Jeff Layton wrote:
> On Tue, 2 Dec 2014 06:57:50 -0500
> Jeff Layton <[email protected]> wrote:
>
> > On Mon, 1 Dec 2014 19:38:19 -0500
> > Trond Myklebust <[email protected]> wrote:
> >
> > > On Mon, Dec 1, 2014 at 6:47 PM, J. Bruce Fields <[email protected]> wrote:
> > > > - instead we're walking the list of all threads looking for an
> > > > idle one. I suppose that's tpyically not more than a few
> > > > hundred. Does this being fast depend on the fact that that
> > > > list is almost never changed? Should we be rearranging
> > > > svc_rqst so frequently-written fields aren't nearby?
> > >
> > > Given a 64-byte cache line, that is 8 pointers worth on a 64-bit processor.
> > >
> > > - rq_all, rq_server, rq_pool, rq_task don't ever change, so perhaps
> > > shove them together into the same cacheline?
> > >
> > > - rq_xprt does get set often until we have a full RPC request worth of
> > > data, so perhaps consider moving that.
> > >
> > > - OTOH, rq_addr, rq_addrlen, rq_daddr, rq_daddrlen are only set once
> > > we have a full RPC to process, and then keep their values until that
> > > RPC call is finished. That doesn't look too bad.
By the way, one thing I forgot when writing the above comment was that
the list we're walking (sp_all_threads) is *still* per-pool (for some
reason I was thinking it was global), so it's really unlikely we're
making things worse here.
Still, reshuffling those svc_rqst fields is easy and might help.
I think your tests probably aren't hitting the worst case here, either:
even in a read-mostly case most interrupts will be handling the (less
frequent but larger) writes. Maybe an all-stat workload would test the
case where e.g. rq_xprt is written to every time? But I haven't thought
that through.
--b.
On Fri, 21 Nov 2014 14:19:31 -0500
Jeff Layton <[email protected]> wrote:
> These were useful when I was tracking down a race condition between
> svc_xprt_do_enqueue and svc_get_next_xprt.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> include/trace/events/sunrpc.h | 94 +++++++++++++++++++++++++++++++++++++++++++
> net/sunrpc/svc_xprt.c | 23 +++++++----
> 2 files changed, 110 insertions(+), 7 deletions(-)
>
> diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
> index ee4438a63a48..b9c1dc6c825a 100644
> --- a/include/trace/events/sunrpc.h
> +++ b/include/trace/events/sunrpc.h
> @@ -8,6 +8,7 @@
> #include <linux/sunrpc/clnt.h>
> #include <linux/sunrpc/svc.h>
> #include <linux/sunrpc/xprtsock.h>
> +#include <linux/sunrpc/svc_xprt.h>
> #include <net/tcp_states.h>
> #include <linux/net.h>
> #include <linux/tracepoint.h>
> @@ -480,6 +481,99 @@ DEFINE_EVENT(svc_rqst_status, svc_send,
> TP_PROTO(struct svc_rqst *rqst, int status),
> TP_ARGS(rqst, status));
>
> +#define show_svc_xprt_flags(flags) \
> + __print_flags(flags, "|", \
> + { (1UL << XPT_BUSY), "XPT_BUSY"}, \
> + { (1UL << XPT_CONN), "XPT_CONN"}, \
> + { (1UL << XPT_CLOSE), "XPT_CLOSE"}, \
> + { (1UL << XPT_DATA), "XPT_DATA"}, \
> + { (1UL << XPT_TEMP), "XPT_TEMP"}, \
> + { (1UL << XPT_DEAD), "XPT_DEAD"}, \
> + { (1UL << XPT_CHNGBUF), "XPT_CHNGBUF"}, \
> + { (1UL << XPT_DEFERRED), "XPT_DEFERRED"}, \
> + { (1UL << XPT_OLD), "XPT_OLD"}, \
> + { (1UL << XPT_LISTENER), "XPT_LISTENER"}, \
> + { (1UL << XPT_CACHE_AUTH), "XPT_CACHE_AUTH"}, \
> + { (1UL << XPT_LOCAL), "XPT_LOCAL"})
> +
> +TRACE_EVENT(svc_xprt_do_enqueue,
> + TP_PROTO(struct svc_xprt *xprt, struct svc_rqst *rqst),
> +
> + TP_ARGS(xprt, rqst),
> +
> + TP_STRUCT__entry(
> + __field(struct svc_xprt *, xprt)
> + __field(struct svc_rqst *, rqst)
> + ),
> +
> + TP_fast_assign(
> + __entry->xprt = xprt;
> + __entry->rqst = rqst;
> + ),
> +
> + TP_printk("xprt=0x%p addr=%pIScp pid=%d flags=%s", __entry->xprt,
> + (struct sockaddr *)&__entry->xprt->xpt_remote,
> + __entry->rqst ? __entry->rqst->rq_task->pid : 0,
> + show_svc_xprt_flags(__entry->xprt->xpt_flags))
> +);
> +
> +TRACE_EVENT(svc_xprt_dequeue,
> + TP_PROTO(struct svc_xprt *xprt),
> +
> + TP_ARGS(xprt),
> +
> + TP_STRUCT__entry(
> + __field(struct svc_xprt *, xprt)
> + __field_struct(struct sockaddr_storage, ss)
> + __field(unsigned long, flags)
> + ),
> +
> + TP_fast_assign(
> + __entry->xprt = xprt,
> + xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss));
> + __entry->flags = xprt ? xprt->xpt_flags : 0;
> + ),
> +
> + TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt,
> + (struct sockaddr *)&__entry->ss,
> + show_svc_xprt_flags(__entry->flags))
> +);
> +
> +TRACE_EVENT(svc_wake_up,
> + TP_PROTO(int pid),
> +
> + TP_ARGS(pid),
> +
> + TP_STRUCT__entry(
> + __field(int, pid)
> + ),
> +
> + TP_fast_assign(
> + __entry->pid = pid;
> + ),
> +
> + TP_printk("pid=%d", __entry->pid)
> +);
> +
> +TRACE_EVENT(svc_handle_xprt,
> + TP_PROTO(struct svc_xprt *xprt, int len),
> +
> + TP_ARGS(xprt, len),
> +
> + TP_STRUCT__entry(
> + __field(struct svc_xprt *, xprt)
> + __field(int, len)
> + ),
> +
> + TP_fast_assign(
> + __entry->xprt = xprt;
> + __entry->len = len;
> + ),
> +
> + TP_printk("xprt=0x%p addr=%pIScp len=%d flags=%s", __entry->xprt,
> + (struct sockaddr *)&__entry->xprt->xpt_remote, __entry->len,
> + show_svc_xprt_flags(__entry->xprt->xpt_flags))
> +);
> #endif /* _TRACE_SUNRPC_H */
>
> #include <trace/define_trace.h>
> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> index ed90d955f733..0ae1d78d934d 100644
> --- a/net/sunrpc/svc_xprt.c
> +++ b/net/sunrpc/svc_xprt.c
> @@ -322,12 +322,12 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
> static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
> {
> struct svc_pool *pool;
> - struct svc_rqst *rqstp;
> + struct svc_rqst *rqstp = NULL;
> int cpu;
> bool queued = false;
>
> if (!svc_xprt_has_something_to_do(xprt))
> - return;
> + goto out;
>
> /* Mark transport as busy. It will remain in this state until
> * the provider calls svc_xprt_received. We update XPT_BUSY
> @@ -337,7 +337,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
> if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
> /* Don't enqueue transport while already enqueued */
> dprintk("svc: transport %p busy, not enqueued\n", xprt);
> - return;
> + goto out;
> }
>
> cpu = get_cpu();
> @@ -377,7 +377,7 @@ redo_search:
> atomic_long_inc(&pool->sp_stats.threads_woken);
> wake_up_process(rqstp->rq_task);
> put_cpu();
> - return;
> + goto out;
> }
> rcu_read_unlock();
>
> @@ -387,6 +387,7 @@ redo_search:
> * up to it directly but we can wake the thread up in the hopes that it
> * will pick it up once it searches for a xprt to service.
> */
> + dprintk("svc: transport %p put into queue\n", xprt);
Not sure how I ended up adding this dprintk here. That can certainly be
removed since it's a duplicate of the one inside the following
conditional block and we don't really want that to fire but once.
Bruce, do you want me to resend this patch with that removed?
> if (!queued) {
> queued = true;
> dprintk("svc: transport %p put into queue\n", xprt);
> @@ -396,7 +397,10 @@ redo_search:
> spin_unlock_bh(&pool->sp_lock);
> goto redo_search;
> }
> + rqstp = NULL;
> put_cpu();
> +out:
> + trace_svc_xprt_do_enqueue(xprt, rqstp);
> }
>
> /*
> @@ -420,7 +424,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
> struct svc_xprt *xprt = NULL;
>
> if (list_empty(&pool->sp_sockets))
> - return NULL;
> + goto out;
>
> spin_lock_bh(&pool->sp_lock);
> if (likely(!list_empty(&pool->sp_sockets))) {
> @@ -433,7 +437,8 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
> xprt, atomic_read(&xprt->xpt_ref.refcount));
> }
> spin_unlock_bh(&pool->sp_lock);
> -
> +out:
> + trace_svc_xprt_dequeue(xprt);
> return xprt;
> }
>
> @@ -515,6 +520,7 @@ void svc_wake_up(struct svc_serv *serv)
> rcu_read_unlock();
> dprintk("svc: daemon %p woken up.\n", rqstp);
> wake_up_process(rqstp->rq_task);
> + trace_svc_wake_up(rqstp->rq_task->pid);
> return;
> }
> rcu_read_unlock();
> @@ -522,6 +528,7 @@ void svc_wake_up(struct svc_serv *serv)
> /* No free entries available */
> set_bit(SP_TASK_PENDING, &pool->sp_flags);
> smp_wmb();
> + trace_svc_wake_up(0);
> }
> EXPORT_SYMBOL_GPL(svc_wake_up);
>
> @@ -740,7 +747,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
> dprintk("svc_recv: found XPT_CLOSE\n");
> svc_delete_xprt(xprt);
> /* Leave XPT_BUSY set on the dead xprt: */
> - return 0;
> + goto out;
> }
> if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
> struct svc_xprt *newxpt;
> @@ -771,6 +778,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
> }
> /* clear XPT_BUSY: */
> svc_xprt_received(xprt);
> +out:
> + trace_svc_handle_xprt(xprt, len);
> return len;
> }
>
--
Jeff Layton <[email protected]>
On Tue, Dec 02, 2014 at 08:31:12AM -0500, Jeff Layton wrote:
> On Fri, 21 Nov 2014 14:19:31 -0500
> Jeff Layton <[email protected]> wrote:
>
> > These were useful when I was tracking down a race condition between
> > svc_xprt_do_enqueue and svc_get_next_xprt.
> >
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > include/trace/events/sunrpc.h | 94 +++++++++++++++++++++++++++++++++++++++++++
> > net/sunrpc/svc_xprt.c | 23 +++++++----
> > 2 files changed, 110 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
> > index ee4438a63a48..b9c1dc6c825a 100644
> > --- a/include/trace/events/sunrpc.h
> > +++ b/include/trace/events/sunrpc.h
> > @@ -8,6 +8,7 @@
> > #include <linux/sunrpc/clnt.h>
> > #include <linux/sunrpc/svc.h>
> > #include <linux/sunrpc/xprtsock.h>
> > +#include <linux/sunrpc/svc_xprt.h>
> > #include <net/tcp_states.h>
> > #include <linux/net.h>
> > #include <linux/tracepoint.h>
> > @@ -480,6 +481,99 @@ DEFINE_EVENT(svc_rqst_status, svc_send,
> > TP_PROTO(struct svc_rqst *rqst, int status),
> > TP_ARGS(rqst, status));
> >
> > +#define show_svc_xprt_flags(flags) \
> > + __print_flags(flags, "|", \
> > + { (1UL << XPT_BUSY), "XPT_BUSY"}, \
> > + { (1UL << XPT_CONN), "XPT_CONN"}, \
> > + { (1UL << XPT_CLOSE), "XPT_CLOSE"}, \
> > + { (1UL << XPT_DATA), "XPT_DATA"}, \
> > + { (1UL << XPT_TEMP), "XPT_TEMP"}, \
> > + { (1UL << XPT_DEAD), "XPT_DEAD"}, \
> > + { (1UL << XPT_CHNGBUF), "XPT_CHNGBUF"}, \
> > + { (1UL << XPT_DEFERRED), "XPT_DEFERRED"}, \
> > + { (1UL << XPT_OLD), "XPT_OLD"}, \
> > + { (1UL << XPT_LISTENER), "XPT_LISTENER"}, \
> > + { (1UL << XPT_CACHE_AUTH), "XPT_CACHE_AUTH"}, \
> > + { (1UL << XPT_LOCAL), "XPT_LOCAL"})
> > +
> > +TRACE_EVENT(svc_xprt_do_enqueue,
> > + TP_PROTO(struct svc_xprt *xprt, struct svc_rqst *rqst),
> > +
> > + TP_ARGS(xprt, rqst),
> > +
> > + TP_STRUCT__entry(
> > + __field(struct svc_xprt *, xprt)
> > + __field(struct svc_rqst *, rqst)
> > + ),
> > +
> > + TP_fast_assign(
> > + __entry->xprt = xprt;
> > + __entry->rqst = rqst;
> > + ),
> > +
> > + TP_printk("xprt=0x%p addr=%pIScp pid=%d flags=%s", __entry->xprt,
> > + (struct sockaddr *)&__entry->xprt->xpt_remote,
> > + __entry->rqst ? __entry->rqst->rq_task->pid : 0,
> > + show_svc_xprt_flags(__entry->xprt->xpt_flags))
> > +);
> > +
> > +TRACE_EVENT(svc_xprt_dequeue,
> > + TP_PROTO(struct svc_xprt *xprt),
> > +
> > + TP_ARGS(xprt),
> > +
> > + TP_STRUCT__entry(
> > + __field(struct svc_xprt *, xprt)
> > + __field_struct(struct sockaddr_storage, ss)
> > + __field(unsigned long, flags)
> > + ),
> > +
> > + TP_fast_assign(
> > + __entry->xprt = xprt,
> > + xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss));
> > + __entry->flags = xprt ? xprt->xpt_flags : 0;
> > + ),
> > +
> > + TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt,
> > + (struct sockaddr *)&__entry->ss,
> > + show_svc_xprt_flags(__entry->flags))
> > +);
> > +
> > +TRACE_EVENT(svc_wake_up,
> > + TP_PROTO(int pid),
> > +
> > + TP_ARGS(pid),
> > +
> > + TP_STRUCT__entry(
> > + __field(int, pid)
> > + ),
> > +
> > + TP_fast_assign(
> > + __entry->pid = pid;
> > + ),
> > +
> > + TP_printk("pid=%d", __entry->pid)
> > +);
> > +
> > +TRACE_EVENT(svc_handle_xprt,
> > + TP_PROTO(struct svc_xprt *xprt, int len),
> > +
> > + TP_ARGS(xprt, len),
> > +
> > + TP_STRUCT__entry(
> > + __field(struct svc_xprt *, xprt)
> > + __field(int, len)
> > + ),
> > +
> > + TP_fast_assign(
> > + __entry->xprt = xprt;
> > + __entry->len = len;
> > + ),
> > +
> > + TP_printk("xprt=0x%p addr=%pIScp len=%d flags=%s", __entry->xprt,
> > + (struct sockaddr *)&__entry->xprt->xpt_remote, __entry->len,
> > + show_svc_xprt_flags(__entry->xprt->xpt_flags))
> > +);
> > #endif /* _TRACE_SUNRPC_H */
> >
> > #include <trace/define_trace.h>
> > diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> > index ed90d955f733..0ae1d78d934d 100644
> > --- a/net/sunrpc/svc_xprt.c
> > +++ b/net/sunrpc/svc_xprt.c
> > @@ -322,12 +322,12 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
> > static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
> > {
> > struct svc_pool *pool;
> > - struct svc_rqst *rqstp;
> > + struct svc_rqst *rqstp = NULL;
> > int cpu;
> > bool queued = false;
> >
> > if (!svc_xprt_has_something_to_do(xprt))
> > - return;
> > + goto out;
> >
> > /* Mark transport as busy. It will remain in this state until
> > * the provider calls svc_xprt_received. We update XPT_BUSY
> > @@ -337,7 +337,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
> > if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
> > /* Don't enqueue transport while already enqueued */
> > dprintk("svc: transport %p busy, not enqueued\n", xprt);
> > - return;
> > + goto out;
> > }
> >
> > cpu = get_cpu();
> > @@ -377,7 +377,7 @@ redo_search:
> > atomic_long_inc(&pool->sp_stats.threads_woken);
> > wake_up_process(rqstp->rq_task);
> > put_cpu();
> > - return;
> > + goto out;
> > }
> > rcu_read_unlock();
> >
> > @@ -387,6 +387,7 @@ redo_search:
> > * up to it directly but we can wake the thread up in the hopes that it
> > * will pick it up once it searches for a xprt to service.
> > */
> > + dprintk("svc: transport %p put into queue\n", xprt);
>
> Not sure how I ended up adding this dprintk here. That can certainly be
> removed since it's a duplicate of the one inside the following
> conditional block and we don't really want that to fire but once.
>
> Bruce, do you want me to resend this patch with that removed?
I've fixed it, thanks for warning me.
--b.
On Fri, Nov 21, 2014 at 02:19:27PM -0500, Jeff Layton wrote:
> Here are the patches that I had mentioned earlier that reduce the
> contention for the pool->sp_lock when the server is heavily loaded.
>
> The basic problem is that whenever a svc_xprt needs to be queued up for
> servicing, we have to take the pool->sp_lock to try and find an idle
> thread to service it. On a busy server, that lock becomes highly
> contended and that limits the throughput.
>
> This patchset fixes this by changing how we search for an idle thread.
> First, we convert svc_rqst and the sp_all_threads list to be
> RCU-managed. Then we change the search for an idle thread to use the
> sp_all_threads list, which now can be done under the rcu_read_lock.
> When there is an available thread, queueing an xprt to it can now be
> done without any spinlocking.
>
> With this, we see a pretty substantial increase in performance on a
> larger-scale server that is heavily loaded. Chris has some preliminary
> numbers, but they need to be cleaned up a bit before we can present
> them. I'm hoping to have those by early next week.
I have these merged on top of Trond's nfs-for-3.19-1 tag. The result is
at git://linux-nfs.org/~bfields/linux-topics.git for-3.19-incoming if
you want to check that I haven't screwed anything up.
I'll probably push that to for-3.19 once it's passed a little more
testing, and send a pull request once I've cleared my mailbox backlog
and seen Trond's stuff merged.
--b.